/[svn.andrew.net.au]/scripts/movies.py
ViewVC logotype

Contents of /scripts/movies.py

Parent Directory | Revision Log


Revision 55 - (show annotations)
Mon Mar 19 05:10:38 2012 UTC (8 years ago) by apollock
File MIME type: text/x-python
File size: 2149 byte(s)
Small clean up

Use a variable for the URL to retrieve, and derive the base URL for relative URLs from that

1 #!/usr/bin/python
2
3 import copy
4 import urllib2
5 import urlparse
6 import time
7 import datetime
8 import lxml.html
9 import xml.sax.saxutils
10
# Values passed to find_class() in main(); every matching element is dropped
# from the scraped listing before it is embedded in the feed.
crufty_classes = [
    "rating rating-list",
    "add_to_watchlist",
    "see-more",
    "rating_txt",
    "header",
]
18
# Page that gets scraped for the feed content.
URL = "http://www.imdb.com/movies-in-theaters/"

# "scheme://host" part of URL, used to turn site-relative links on the page
# into absolute ones.
_url_parts = urlparse.urlsplit(URL)
base_url = "%s://%s" % (_url_parts.scheme, _url_parts.netloc)
21
# Atom document wrapped around the scraped (and XML-escaped) HTML fragment.
_FEED_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">

<link href="http://home.andrew.net.au/~apollock/movies.xml" rel="self"/>

<title>This week's movies from IMDb</title>
<updated>%(updated)sZ</updated>
<author>
<name>Andrew Pollock</name>
</author>
<id>http://www.andrew.net.au/</id>

<entry>
<id>http://home.andrew.net.au/movies/%(timestamp)s</id>

<updated>%(updated)sZ</updated>
<title>This week's movies</title>

<content type="html">
%(movies)s
</content>
</entry>
</feed>
"""


def _find_first(elements, matches, description):
    """Return the first element of `elements` for which `matches` is true.

    Raises LookupError (mentioning `description`) when nothing matches, so a
    change in the page layout fails loudly instead of silently publishing
    whatever element happened to be iterated last.
    """
    for element in elements:
        if matches(element):
            return element
    raise LookupError("could not find %s in %s" % (description, URL))


def _absolutize(root, tag, attribute):
    """Make site-relative `attribute` values on `tag` elements absolute."""
    for element in root.iter(tag):
        value = element.get(attribute, "")
        if value.startswith("/"):
            # urljoin rather than plain concatenation: it gives the same
            # result for "/path" links and also resolves protocol-relative
            # "//host/path" links, which concatenation would mangle into
            # "http://www.imdb.com//host/path".
            element.set(attribute, urlparse.urljoin(base_url, value))


def main():
    """Fetch URL, extract the new-releases listing and print an Atom feed.

    The listing is the first element with class "list detail sub-list"
    inside the <div id="main"> container (which skips the Box Office Top
    Ten). Site-relative image and hyperlink targets are made absolute,
    elements matching `crufty_classes` are dropped, and the remaining HTML
    is XML-escaped and embedded in a single-entry Atom feed written to
    standard output.

    Raises:
        LookupError: if either expected container is missing from the page.
    """
    import contextlib

    # urllib2 responses are not context managers; closing() guarantees the
    # connection is released even if reading fails.
    with contextlib.closing(urllib2.urlopen(URL)) as imdb:
        page = imdb.read()
    doc = lxml.html.fromstring(page)

    main_div = _find_first(
        doc.iter(tag=lxml.etree.Element),
        lambda el: el.tag.endswith("div") and el.get("id", "") == "main",
        'the <div id="main"> container')
    # Go deeper to ditch the Box Office Top Ten
    listing = _find_first(
        main_div.iter(tag=lxml.etree.Element),
        lambda el: el.get("class", "") == "list detail sub-list",
        'the "list detail sub-list" element')
    # Work on a detached copy so edits and serialisation only involve the
    # listing itself.
    new_releases = copy.deepcopy(listing)

    _absolutize(new_releases, "img", "src")
    _absolutize(new_releases, "a", "href")

    # Remove unnecessary classes
    for crufty_class in crufty_classes:
        for cruft in new_releases.find_class(crufty_class):
            cruft.drop_tree()

    movies = xml.sax.saxutils.escape(lxml.html.tostring(new_releases))
    # A single parenthesised argument prints identically under the Python 2
    # print statement this script targets.
    print(_FEED_TEMPLATE % {
        'updated': datetime.datetime.utcnow().isoformat()[0:19],
        'movies': movies,
        'timestamp': int(time.time()),
    })
72
# Generate the feed only when run as a script, not when imported.
if __name__ == "__main__":
    main()

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.22