1 |
#!/usr/bin/python |
2 |
|
3 |
import contextlib
import copy
import datetime
import time
import urllib2
import xml.sax.saxutils

import lxml.etree
import lxml.html
9 |
|
10 |
# CSS class names of unwanted elements. main() looks each one up with
# find_class() and removes every match (drop_tree()) before the listing is
# serialized into the feed.
crufty_classes = [
    "rating rating-list",
    "add_to_watchlist",
    "see-more",
    "rating_txt",
    "header",
]
17 |
|
18 |
def _first_matching(root, matches, description):
    """Return the first element at or below *root* accepted by *matches*.

    Elements are visited in document order, starting with *root* itself;
    comments and processing instructions are skipped.

    Raises:
        LookupError: if no element matches. Without this, a change in the
            page markup would silently select whatever element happened to
            be iterated last.
    """
    for candidate in root.iter(tag=lxml.etree.Element):
        if matches(candidate):
            return candidate
    raise LookupError("could not find %s in the fetched page" % description)


def _is_main_div(element):
    """Tell whether *element* is a div whose id attribute is "main"."""
    return (element.tag.endswith("div")
            and element.attrib.get("id", "") == "main")


def _is_release_list(element):
    """Tell whether *element*'s class attribute is exactly the sub-list one."""
    return element.attrib.get("class", "") == "list detail sub-list"


def _render_feed(movies, updated, timestamp):
    """Fill the Atom feed template and return it as a string.

    *movies* is the already XML-escaped HTML for the entry content,
    *updated* is a "YYYY-MM-DDTHH:MM:SS" UTC timestamp (the template appends
    the "Z"), and *timestamp* makes the entry id unique per run.
    """
    return """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">

<link href="http://home.andrew.net.au/~apollock/movies.xml" rel="self"/>

<title>This week's movies from IMDb</title>
<updated>%(updated)sZ</updated>
<author>
<name>Andrew Pollock</name>
</author>
<id>http://www.andrew.net.au/</id>

<entry>
<id>http://home.andrew.net.au/movies/%(timestamp)s</id>

<updated>%(updated)sZ</updated>
<title>This week's movies</title>

<content type="html">
%(movies)s
</content>
</entry>
</feed>
""" % {'updated': updated, 'movies': movies, 'timestamp': timestamp}


def main():
    """Fetch IMDb's movies-in-theaters page and print it as an Atom feed.

    The page is downloaded and parsed, the "list detail sub-list" element
    inside the div with id "main" is extracted, every element carrying one
    of the ``crufty_classes`` is dropped from it, and the remaining HTML is
    XML-escaped and written to stdout as the single entry of an Atom feed.

    Raises:
        LookupError: if the expected container elements are not present in
            the page (for example after a site redesign).
    """
    # The urllib2 response is not a context manager on Python 2, so closing()
    # is what guarantees the connection is released even if read() fails.
    with contextlib.closing(
            urllib2.urlopen("http://www.imdb.com/movies-in-theaters/")) as imdb:
        page = imdb.read()
    doc = lxml.html.fromstring(page)

    main_div = _first_matching(doc, _is_main_div, 'the <div id="main"> element')
    # Go deeper to ditch the Box Office Top Ten
    release_list = _first_matching(
        main_div, _is_release_list,
        'the "list detail sub-list" element')

    # Work on a detached copy so that dropping elements and serializing
    # only ever involve the extracted subtree.
    new_releases = copy.deepcopy(release_list)
    for crufty_class in crufty_classes:
        for cruft in new_releases.find_class(crufty_class):
            cruft.drop_tree()

    # The feed declares <content type="html">, so the markup is embedded as
    # escaped text rather than as child elements.
    movies = xml.sax.saxutils.escape(lxml.html.tostring(new_releases))
    # isoformat() may carry microseconds; the first 19 characters are the
    # "YYYY-MM-DDTHH:MM:SS" part.
    updated = datetime.datetime.utcnow().isoformat()[0:19]
    print(_render_feed(movies, updated, int(time.time())))
59 |
|
60 |
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()