/[svn.andrew.net.au]/scripts/movies.py
ViewVC logotype

Contents of /scripts/movies.py

Parent Directory | Revision Log


Revision 6 - (show annotations)
Thu Nov 2 05:29:32 2006 UTC (17 years, 10 months ago) by apollock
File MIME type: text/x-python
File size: 2412 byte(s)
Initial commit of IMDb newly released movies scraper/Atom feed generator

1 #!/usr/bin/python
2
3 from sgmllib import SGMLParser
4 import htmlentitydefs
5 import urllib
6 import time
7 import datetime
8 import xml.sax.saxutils
9
class BaseHTMLProcessor(SGMLParser):
    """Capture the markup of every ``<table class="movies">`` in a page.

    The parser ignores everything until it sees a ``table`` start tag whose
    ``class`` attribute names ``movies``.  From that point on, every tag,
    text run, entity, comment, processing instruction and declaration is
    re-serialised into ``self.pieces`` until the matching ``</table>`` is
    reached.  Tables nested inside the captured table are tracked with
    ``self.nestlevel`` so that capture stops at the right end tag.

    Call ``output()`` after feeding the document to get the captured markup
    as one string.
    """

    def reset(self):
        """Discard captured output and capture state, then reset the parser."""
        self.pieces = []        # captured markup fragments, in document order
        self.include = False    # True while inside a "movies" table
        self.nestlevel = 0      # depth of <table> nesting while capturing
        SGMLParser.reset(self)

    def _is_movies_table(self, attrs):
        """Return True if *attrs* has a ``class`` attribute naming ``movies``.

        *attrs* is the list of ``(name, value)`` pairs passed to
        ``unknown_starttag``.  Every pair is examined, so the list may be
        empty and ``class`` need not be the first attribute.
        """
        for key, value in attrs:
            # A class attribute is a space-separated token list, so
            # class="movies" and class="movies wide" both qualify.
            if key == "class" and value is not None and "movies" in value.split():
                return True
        return False

    def unknown_starttag(self, tag, attrs):
        """Start capturing at a "movies" table; echo start tags while capturing."""
        if tag == "table":
            if self._is_movies_table(attrs):
                self.include = True
            if self.include:
                # Count the movies table itself and any table nested in it.
                self.nestlevel += 1

        if self.include:
            strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
            self.pieces.append("<%s%s>" % (tag, strattrs))

    def unknown_endtag(self, tag):
        """Echo end tags while capturing; stop at the movies table's end tag."""
        if self.include:
            self.pieces.append("</%s>" % tag)
            if tag == "table":
                self.nestlevel -= 1
                if self.nestlevel == 0:
                    # The outermost captured table has just been closed.
                    self.include = False

    def handle_charref(self, ref):
        """Re-emit a numeric character reference while capturing."""
        if self.include:
            self.pieces.append("&#%s;" % ref)

    def handle_entityref(self, ref):
        """Re-emit an entity reference while capturing.

        The terminating semicolon is only appended for entity names listed
        in ``htmlentitydefs.entitydefs``.
        """
        if self.include:
            self.pieces.append("&%s" % ref)
            if ref in htmlentitydefs.entitydefs:
                self.pieces.append(";")

    def handle_data(self, text):
        """Record a run of text while capturing."""
        if self.include:
            self.pieces.append(text)

    def handle_comment(self, text):
        """Re-emit an HTML comment while capturing."""
        if self.include:
            self.pieces.append("<!--%s-->" % text)

    def handle_pi(self, text):
        """Re-emit a processing instruction while capturing."""
        if self.include:
            self.pieces.append("<?%s>" % text)

    def handle_decl(self, text):
        """Re-emit a markup declaration while capturing."""
        if self.include:
            self.pieces.append("<!%s>" % text)

    def output(self):
        """Return all captured markup joined into a single string."""
        return "".join(self.pieces)
68
69
def main():
    """Print an Atom feed wrapping IMDb's "now playing" movie tables.

    Downloads http://www.imdb.com/nowplaying/, extracts the tables captured
    by ``BaseHTMLProcessor``, XML-escapes that markup and writes a
    single-entry Atom document to standard output.  Both ``<updated>``
    elements carry the current UTC time.
    """
    parser = BaseHTMLProcessor()
    html = urllib.urlopen("http://www.imdb.com/nowplaying/")
    try:
        parser.feed(html.read())
    finally:
        # Release the connection even if reading or parsing fails.
        html.close()
    # Make the parser process any input it is still buffering.
    parser.close()

    # The captured HTML is carried as escaped text inside <content type="html">.
    movies = xml.sax.saxutils.escape(parser.output())
    updated = datetime.datetime.utcnow().isoformat()

    # Single-argument call form: prints the same text under the Python 2
    # print statement as it would as a function call.
    print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">

<link href="http://home.andrew.net.au/~apollock/movies.xml" rel="self"/>

<title>This week's movies from IMDb</title>
<updated>%(updated)sZ</updated>
<author>
<name>Andrew Pollock</name>
</author>
<id>http://www.andrew.net.au/</id>

<entry>
<id>http://www.imdb.com/nowplaying/</id>

<updated>%(updated)sZ</updated>
<title>This week's movies</title>

<content type="html">
%(movies)s
</content>
</entry>
</feed>
""" % {'updated': updated, 'movies': movies})


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.22