2009-09-18, 15:29
I really hate BeautifulSoup. Sometimes it works great, but mostly I don't understand why it isn't working.
I need to print every "title, desc, thumb, video" with this code, but it returns just the first ones.
Code:
import urllib2
from BeautifulSoup import BeautifulSoup
# Base URL of the site; the path constants below are appended to it.
CH_ROOT = "http://www.collegehumor.com"
# Listing-page paths (only CH_RECENT is used by the code visible in this script).
CH_RECENT = "/originals/recent"
CH_VIEWED = "/originals/most-viewed"
CH_LIKED = "/originals/most-liked"
# Path prefix placed between CH_ROOT and a video link's href to build the video URL.
CH_PLAYLIST = "/moogaloop"
def getHTML(url):
try:
print 'common :: getHTML :: url = ' + url
req = urllib2.Request(url)
response = urllib2.urlopen(req)
link = response.read()
response.close()
except urllib2.HTTPError, e:
print "HTTP error: %d" % e.code
except urllib2.URLError, e:
print "Network error: %s" % e.reason.args[1]
else:
return link
html = getHTML(CH_ROOT + CH_RECENT)
soup = BeautifulSoup(html)
for result in soup.findAll("div", id="tab_content_0"):
title = result.findAll("strong", {"class":"title"})[0].a.string.strip()
desc = result.findAll("div", {"class":"linked_details"})[0].p.string.strip()
thumb = result.findAll("img", {"class":"media_thumb"})[0]['src']
video = CH_ROOT + CH_PLAYLIST + result.findAll("a", {"class":"video_link"})[0]['href']
print title, desc, thumb, video