/[svn.andrew.net.au]/scripts/movies.py
ViewVC logotype

Annotation of /scripts/movies.py

Parent Directory | Revision Log


Revision 6 - (hide annotations)
Thu Nov 2 05:29:32 2006 UTC (15 years, 1 month ago) by apollock
File MIME type: text/x-python
File size: 2412 byte(s)
Initial commit of IMDb newly released movies scraper/Atom feed generator

1 apollock 6 #!/usr/bin/python
2    
3     from sgmllib import SGMLParser
4     import htmlentitydefs
5     import urllib
6     import time
7     import datetime
8     import xml.sax.saxutils
9    
10     class BaseHTMLProcessor(SGMLParser):
11    
12     def reset(self):
13     self.pieces = []
14     self.include = False
15     self.nestlevel = 0
16     SGMLParser.reset(self)
17    
18     def unknown_starttag(self, tag, attrs):
19     strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
20     if tag == "table":
21     #print "Saw a table"
22     if 'class' in attrs[0] and 'movies' in attrs[0]:
23     self.include = True
24     if self.include:
25     self.nestlevel += 1
26    
27     if self.include:
28     self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
29    
30    
31     def unknown_endtag(self, tag):
32     if self.include:
33     self.pieces.append("</%(tag)s>" % locals())
34     if tag == "table":
35     self.nestlevel -= 1
36     if self.nestlevel == 0:
37     self.include = False
38    
39    
40     def handle_charref(self, ref):
41     if self.include:
42     self.pieces.append("&#%(ref)s;" % locals())
43    
44     def handle_entityref(self, ref):
45     if self.include:
46     self.pieces.append("&%(ref)s" % locals())
47     if htmlentitydefs.entitydefs.has_key(ref):
48     self.pieces.append(";")
49    
50     def handle_data(self, text):
51     if self.include:
52     self.pieces.append(text)
53    
54     def handle_comment(self, text):
55     if self.include:
56     self.pieces.append("<!--%(text)s-->" % locals())
57    
58     def handle_pi(self, text):
59     if self.include:
60     self.pieces.append("<?%(text)s>" % locals())
61    
62     def handle_decl(self, text):
63     if self.include:
64     self.pieces.append("<!%(text)s>" % locals())
65    
66     def output(self):
67     return "".join(self.pieces)
68    
69    
def main():
    """Fetch IMDb's now-playing page and print it as a one-entry Atom feed.

    The page is run through BaseHTMLProcessor, the captured table markup is
    XML-escaped, and the result is embedded as the ``html`` content of a
    single Atom entry written to standard output.  Both ``<updated>``
    elements carry the current UTC time.
    """
    parser = BaseHTMLProcessor()
    html = urllib.urlopen("http://www.imdb.com/nowplaying/")
    try:
        parser.feed(html.read())
    finally:
        # Release the connection even if reading or parsing fails.
        html.close()
    # sgmllib may hold back trailing input until close() forces it through.
    parser.close()

    # The entry content is declared type="html", so the markup itself has
    # to be escaped before it is placed inside the XML document.
    movies = xml.sax.saxutils.escape(parser.output())
    print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">

<link href="http://home.andrew.net.au/~apollock/movies.xml" rel="self"/>

<title>This week's movies from IMDb</title>
<updated>%(updated)sZ</updated>
<author>
<name>Andrew Pollock</name>
</author>
<id>http://www.andrew.net.au/</id>

<entry>
<id>http://www.imdb.com/nowplaying/</id>

<updated>%(updated)sZ</updated>
<title>This week's movies</title>

<content type="html">
%(movies)s
</content>
</entry>
</feed>
""" % {'updated': datetime.datetime.utcnow().isoformat(), 'movies': movies})
102    
# Generate the feed only when run as a script, not when imported.
if __name__ == "__main__":
    main()

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.22