from BeautifulSoup import BeautifulSoup
import urllib2, ClientCookie, pickle, webbrowser
def MakeSoup(pageno):
    """Fetch one result page and feed its HTML into the shared soup.

    The URL is built by placing ``pageno`` between the two fixed URL
    fragments below.  The response body is fed into the module-level
    ``soup`` object, which therefore has to exist before this function is
    called.  Every call feeds that same object; presumably the pages
    accumulate in one parse tree (depends on BeautifulSoup's ``feed`` --
    TODO confirm).

    pageno -- page number to request; passed through str() for the URL.

    Returns the module-level ``soup`` object after the page has been fed.
    """
    urlstart = 'XXXobscuredXXX'
    urlend = 'XXXobscuredXXX'
    request = urllib2.Request(urlstart + str(pageno) + urlend)
    request.add_header('Accept-charset', 'utf-8,*')
    # NOTE(review): the cookie name suggests it bypasses a legal-disclaimer
    # page on the target site -- confirm against the site.
    request.add_header('Cookie', "LegalDisclaimer=1")
    f = ClientCookie.urlopen(request)
    try:
        response = f.read()
    finally:
        # Release the connection even when the read fails part-way.
        f.close()
    soup.feed(response)
    return soup
def ScrapeNumbers(source=None):
    """Collect (MLS number, property id) pairs from the parsed pages.

    Every ``<div class="Label">`` element is inspected.  The string form of
    its first child is searched for the text 'MLS'; when that text is found
    past position 0, two values are sliced out of the string:

    * the MLS number -- the text that starts 10 characters after 'MLS' and
      runs up to the next '<' (or the end of the string);
    * the property id -- the text between 'PropertyID' plus one separator
      character and the '">MLS' marker.

    source -- object offering a BeautifulSoup-style ``fetch(name, attrs)``
              method.  Defaults to the module-level ``soup``.

    Returns a list of ``(mlsno, propid)`` tuples of stripped strings, in the
    order ``fetch`` returned the elements.
    """
    if source is None:
        source = soup
    numlist = []
    for label in source.fetch('div', {'class': 'Label'}):
        if not label.contents:
            # An empty div has no first child to inspect.
            continue
        s = str(label.contents[0])
        start = s.find('MLS')
        if start <= 0:
            # No MLS label in this element (or it sits at position 0, which
            # the original condition also rejected).
            continue
        # got an mlsno
        # The label occupies 10 characters starting at 'MLS' -- presumably
        # 'MLS' plus a fixed suffix; verify against the page markup.
        num_start = start + 10
        # The end marker used to be an empty string, for which find()
        # returns 0, so the slice was always empty.  The number is now taken
        # to end at the next tag opening after the label.
        num_end = s.find('<', num_start)
        if num_end < 0:
            num_end = len(s)
        mlsno = s[num_start:num_end].strip()
        # get a property id, too
        id_start = s.find('PropertyID')
        id_end = s.find('">MLS')
        # +11 skips 'PropertyID' and the single character that follows it.
        propid = s[id_start + 11:id_end].strip()
        numlist.append((mlsno, propid))
    return numlist
#########################
# Main script: load the history of listings already seen, fetch every
# result page, scrape the listing numbers, open each unseen listing in the
# browser and save the updated history.

# load our MLS number history
try:
    history_file = open('mlslist.pickle')
    try:
        mlshistory = pickle.load(history_file)
    finally:
        history_file.close()
except Exception:
    # Best effort: a missing or unreadable history file (e.g. on the very
    # first run) simply means starting with an empty history.
    mlshistory = []

# first pass gets the page count
print("Getting page count...")
pageno = 1
# MakeSoup() feeds this module-level object and ScrapeNumbers() reads it,
# so it has to exist before the first MakeSoup() call.
soup = BeautifulSoup()
soup = MakeSoup(pageno)

# identify page count: the text between 'of' and '-' in the PageHeader span
pagelist = soup.first('span', {'class': 'PageHeader'})
s = str(pagelist.contents[0])
start = s.find('of')
end = s.find('-')
pagecount = s[start+2:end].strip()
print("There are " + str(pagecount) + " pages")

# make more soup: page 1 is already loaded, so fetch pages 2..pagecount
lastpage = int(pagecount)
while pageno < lastpage:
    pageno += 1
    # Announce the page that is about to be fetched.
    print("Processing page " + str(pageno))
    soup = MakeSoup(pageno)

# parse out new numbers
numlist = ScrapeNumbers()
newnumbers = [pair for pair in numlist if pair not in mlshistory]
print("New Numbers:")
print(newnumbers)
for i in newnumbers:
    mlshistory.append(i)
    # i is (mlsno, propid); the property id completes the listing URL.
    webbrowser.open('XXXobscuredXXX' + str(i[1]), 1)

#save the updated MLS number history
# The file mode is left as it was so history files written by earlier runs
# keep loading the same way.
history_file = open('mlslist.pickle', 'w')
try:
    pickle.dump(mlshistory, history_file)
finally:
    history_file.close()
The downloaded files are .csv files, which will, in a perfect world, be handed off to an Excel macro that puts them to their intended use.
posted by bingo at 7:29 AM on August 8, 2004