1 |
apollock |
6 |
#!/usr/bin/python |
2 |
|
|
|
3 |
|
|
from sgmllib import SGMLParser |
4 |
|
|
import htmlentitydefs |
5 |
|
|
import urllib |
6 |
|
|
import time |
7 |
|
|
import datetime |
8 |
|
|
import xml.sax.saxutils |
9 |
|
|
|
10 |
|
|
class BaseHTMLProcessor(SGMLParser): |
11 |
|
|
|
12 |
|
|
def reset(self): |
13 |
|
|
self.pieces = [] |
14 |
|
|
self.include = False |
15 |
|
|
self.nestlevel = 0 |
16 |
|
|
SGMLParser.reset(self) |
17 |
|
|
|
18 |
|
|
def unknown_starttag(self, tag, attrs): |
19 |
|
|
strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs]) |
20 |
|
|
if tag == "table": |
21 |
|
|
#print "Saw a table" |
22 |
|
|
if 'class' in attrs[0] and 'movies' in attrs[0]: |
23 |
|
|
self.include = True |
24 |
|
|
if self.include: |
25 |
|
|
self.nestlevel += 1 |
26 |
|
|
|
27 |
|
|
if self.include: |
28 |
|
|
self.pieces.append("<%(tag)s%(strattrs)s>" % locals()) |
29 |
|
|
|
30 |
|
|
|
31 |
|
|
def unknown_endtag(self, tag): |
32 |
|
|
if self.include: |
33 |
|
|
self.pieces.append("</%(tag)s>" % locals()) |
34 |
|
|
if tag == "table": |
35 |
|
|
self.nestlevel -= 1 |
36 |
|
|
if self.nestlevel == 0: |
37 |
|
|
self.include = False |
38 |
|
|
|
39 |
|
|
|
40 |
|
|
def handle_charref(self, ref): |
41 |
|
|
if self.include: |
42 |
|
|
self.pieces.append("&#%(ref)s;" % locals()) |
43 |
|
|
|
44 |
|
|
def handle_entityref(self, ref): |
45 |
|
|
if self.include: |
46 |
|
|
self.pieces.append("&%(ref)s" % locals()) |
47 |
|
|
if htmlentitydefs.entitydefs.has_key(ref): |
48 |
|
|
self.pieces.append(";") |
49 |
|
|
|
50 |
|
|
def handle_data(self, text): |
51 |
|
|
if self.include: |
52 |
|
|
self.pieces.append(text) |
53 |
|
|
|
54 |
|
|
def handle_comment(self, text): |
55 |
|
|
if self.include: |
56 |
|
|
self.pieces.append("<!--%(text)s-->" % locals()) |
57 |
|
|
|
58 |
|
|
def handle_pi(self, text): |
59 |
|
|
if self.include: |
60 |
|
|
self.pieces.append("<?%(text)s>" % locals()) |
61 |
|
|
|
62 |
|
|
def handle_decl(self, text): |
63 |
|
|
if self.include: |
64 |
|
|
self.pieces.append("<!%(text)s>" % locals()) |
65 |
|
|
|
66 |
|
|
def output(self): |
67 |
|
|
return "".join(self.pieces) |
68 |
|
|
|
69 |
|
|
|
70 |
|
|
def main():
    """Fetch IMDb's now-playing page and print it as a one-entry Atom feed.

    The page is downloaded from http://www.imdb.com/nowplaying/, run through
    BaseHTMLProcessor, and the captured markup is XML-escaped and embedded
    as the ``<content type="html">`` of a single Atom entry written to
    standard output.  Both ``<updated>`` elements carry the current UTC
    time truncated to whole seconds.
    """
    parser = BaseHTMLProcessor()
    html = urllib.urlopen("http://www.imdb.com/nowplaying/")
    try:
        parser.feed(html.read())
    finally:
        # Release the connection even if reading or parsing fails.
        html.close()
    # Flush any input the parser is still buffering.
    parser.close()

    movies = xml.sax.saxutils.escape(parser.output())
    updated = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")

    # A single parenthesised argument keeps this valid as a Python 2 print
    # statement as well as a print() call.
    print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">

<link href="http://home.andrew.net.au/~apollock/movies.xml" rel="self"/>

<title>This week's movies from IMDb</title>
<updated>%(updated)sZ</updated>
<author>
<name>Andrew Pollock</name>
</author>
<id>http://www.andrew.net.au/</id>

<entry>
<id>http://www.imdb.com/nowplaying/</id>

<updated>%(updated)sZ</updated>
<title>This week's movies</title>

<content type="html">
%(movies)s
</content>
</entry>
</feed>
""" % {'updated': updated, 'movies': movies})


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()