1 |
#!/usr/bin/python |
2 |
|
3 |
import copy |
4 |
import urllib2 |
5 |
import urlparse |
6 |
import time |
7 |
import datetime |
8 |
import lxml.html |
9 |
import xml.sax.saxutils |
10 |
|
11 |
# CSS class names whose elements are stripped from the scraped markup
# before it is embedded in the feed. Each entry is handed unchanged to
# ``find_class`` and every element it returns is dropped.
crufty_classes = [
    "rating rating-list",
    "add_to_watchlist",
    "see-more",
    "rating_txt",
    "header",
]
18 |
|
19 |
# Page that is scraped for this week's releases.
URL = "http://www.imdb.com/movies-in-theaters/"
# "scheme://netloc" of URL, used as the prefix for site-relative links.
base_url = "{0}://{1}".format(*urlparse.urlsplit(URL)[:2])
21 |
|
22 |
# Atom document printed by main(); filled in with the ``updated``,
# ``timestamp`` and ``movies`` keys via %-formatting.
_FEED_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">

<link href="http://home.andrew.net.au/~apollock/movies.xml" rel="self"/>

<title>This week's movies from IMDb</title>
<updated>%(updated)sZ</updated>
<author>
<name>Andrew Pollock</name>
</author>
<id>http://www.andrew.net.au/</id>

<entry>
<id>http://home.andrew.net.au/movies/%(timestamp)s</id>

<updated>%(updated)sZ</updated>
<title>This week's movies</title>

<content type="html">
%(movies)s
</content>
</entry>
</feed>
"""

# Tag name -> attribute holding the URL that may need to be made absolute.
_URL_ATTRIBUTES = {"img": "src", "a": "href"}


def _first_match(elements, predicate, description):
    """Return the first item of *elements* for which *predicate* is true.

    Raises LookupError (mentioning *description*) when nothing matches,
    instead of silently carrying on with whichever element happened to be
    iterated last.
    """
    for element in elements:
        if predicate(element):
            return element
    raise LookupError("could not find %s in %s" % (description, URL))


def _absolutize_links(tree):
    """Rewrite ``img``/``a`` URLs in *tree* that start with "/" as absolute URLs."""
    for element in tree.iter():
        # Comments and processing instructions have a non-string tag, which
        # simply misses the lookup table.
        attribute = _URL_ATTRIBUTES.get(element.tag)
        if attribute is None:
            continue
        target = element.attrib.get(attribute, "")
        if target.startswith("/"):
            # urljoin handles both "/path" (same host) and "//host/path"
            # (protocol-relative); plain prefixing would mangle the latter.
            element.attrib[attribute] = urlparse.urljoin(URL, target)


def main():
    """Fetch the IMDb "in theaters" page and print it as an Atom feed.

    The element with class "list detail sub-list" inside ``<div id="main">``
    is extracted, its site-relative image and hyperlink URLs are made
    absolute, elements carrying any of ``crufty_classes`` are removed, and
    the remaining HTML is XML-escaped into a single feed entry written to
    standard output.

    Raises:
        LookupError: if the expected container elements are not present in
            the fetched page.
    """
    response = urllib2.urlopen(URL)
    try:
        page = response.read()
    finally:
        # Release the HTTP connection even if reading fails.
        response.close()

    doc = lxml.html.fromstring(page)
    main_div = _first_match(
        doc.iter(tag=lxml.etree.Element),
        lambda el: el.tag.endswith("div") and el.attrib.get("id", "") == "main",
        'the <div id="main"> container',
    )
    # Go deeper to ditch the Box Office Top Ten
    sub_list = _first_match(
        main_div.iter(),
        lambda el: el.attrib.get("class", "") == "list detail sub-list",
        'the "list detail sub-list" element',
    )
    # Work on a detached copy so the edits below never touch the parsed page.
    new_releases = copy.deepcopy(sub_list)

    # Fix relative img links and hyperlinks
    _absolutize_links(new_releases)

    # Remove unnecessary classes
    for crufty_class in crufty_classes:
        for cruft in new_releases.find_class(crufty_class):
            cruft.drop_tree()

    movies = xml.sax.saxutils.escape(lxml.html.tostring(new_releases))

    # Read the clock once so the entry id and the <updated> stamps agree.
    now = int(time.time())
    updated = datetime.datetime.utcfromtimestamp(now).isoformat()[0:19]
    print(_FEED_TEMPLATE % {
        'updated': updated,
        'movies': movies,
        'timestamp': now,
    })
72 |
|
73 |
# Script entry point: emit the feed only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()