1 |
#!/usr/bin/python |
2 |
|
3 |
from sgmllib import SGMLParser |
4 |
import htmlentitydefs |
5 |
import urllib |
6 |
import time |
7 |
import datetime |
8 |
import xml.sax.saxutils |
9 |
|
10 |
class BaseHTMLProcessor(SGMLParser): |
11 |
|
12 |
def reset(self): |
13 |
self.pieces = [] |
14 |
self.include = False |
15 |
self.nestlevel = 0 |
16 |
SGMLParser.reset(self) |
17 |
|
18 |
def unknown_starttag(self, tag, attrs): |
19 |
strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs]) |
20 |
if tag == "table": |
21 |
#print "Saw a table" |
22 |
if 'class' in attrs[0] and 'movies' in attrs[0]: |
23 |
self.include = True |
24 |
if self.include: |
25 |
self.nestlevel += 1 |
26 |
|
27 |
if self.include: |
28 |
self.pieces.append("<%(tag)s%(strattrs)s>" % locals()) |
29 |
|
30 |
|
31 |
def unknown_endtag(self, tag): |
32 |
if self.include: |
33 |
self.pieces.append("</%(tag)s>" % locals()) |
34 |
if tag == "table": |
35 |
self.nestlevel -= 1 |
36 |
if self.nestlevel == 0: |
37 |
self.include = False |
38 |
|
39 |
|
40 |
def handle_charref(self, ref): |
41 |
if self.include: |
42 |
self.pieces.append("&#%(ref)s;" % locals()) |
43 |
|
44 |
def handle_entityref(self, ref): |
45 |
if self.include: |
46 |
self.pieces.append("&%(ref)s" % locals()) |
47 |
if htmlentitydefs.entitydefs.has_key(ref): |
48 |
self.pieces.append(";") |
49 |
|
50 |
def handle_data(self, text): |
51 |
if self.include: |
52 |
self.pieces.append(text) |
53 |
|
54 |
def handle_comment(self, text): |
55 |
if self.include: |
56 |
self.pieces.append("<!--%(text)s-->" % locals()) |
57 |
|
58 |
def handle_pi(self, text): |
59 |
if self.include: |
60 |
self.pieces.append("<?%(text)s>" % locals()) |
61 |
|
62 |
def handle_decl(self, text): |
63 |
if self.include: |
64 |
self.pieces.append("<!%(text)s>" % locals()) |
65 |
|
66 |
def output(self): |
67 |
return "".join(self.pieces) |
68 |
|
69 |
|
70 |
def main():
    """Fetch IMDb's now-playing page and print it as an Atom feed on stdout.

    The page is run through ``BaseHTMLProcessor``; the markup it captures is
    XML-escaped and embedded as the ``html`` content of a single feed entry.
    Both ``<updated>`` elements carry the current UTC time.
    """
    parser = BaseHTMLProcessor()
    html = urllib.urlopen("http://www.imdb.com/nowplaying/")
    try:
        parser.feed(html.read())
    finally:
        # Release the connection even if reading or parsing fails.
        html.close()
    # Flush any input the parser is still buffering.
    parser.close()

    movies = xml.sax.saxutils.escape(parser.output())
    updated = datetime.datetime.utcnow().isoformat()
    # A single parenthesised expression prints identically under Python 2.
    print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">

<link href="http://home.andrew.net.au/~apollock/movies.xml" rel="self"/>

<title>This week's movies from IMDb</title>
<updated>%(updated)sZ</updated>
<author>
<name>Andrew Pollock</name>
</author>
<id>http://www.andrew.net.au/</id>

<entry>
<id>http://www.imdb.com/nowplaying/</id>

<updated>%(updated)sZ</updated>
<title>This week's movies</title>

<content type="html">
%(movies)s
</content>
</entry>
</feed>
""" % {'updated': updated, 'movies': movies})
102 |
|
103 |
# Generate the feed only when run as a script, not when imported.
if __name__ == "__main__":
    main()