/[svn.andrew.net.au]/scripts/movies.py
ViewVC logotype

Annotation of /scripts/movies.py

Parent Directory | Revision Log


Revision 55 - (hide annotations)
Mon Mar 19 05:10:38 2012 UTC (9 years, 8 months ago) by apollock
File MIME type: text/x-python
File size: 2149 byte(s)
Small clean up

Use a variable for the URL to retrieve, and derive the base URL for relative URLs from that

1 apollock 6 #!/usr/bin/python
2    
3 apollock 50 import copy
4 apollock 49 import urllib2
5 apollock 55 import urlparse
6 apollock 6 import time
7     import datetime
8 apollock 50 import lxml.html
9 apollock 6 import xml.sax.saxutils
10    
11 apollock 51 crufty_classes = [
12     "rating rating-list",
13     "add_to_watchlist",
14     "see-more",
15     "rating_txt",
16 apollock 52 "header",
17 apollock 51 ]
18    
19 apollock 55 URL = "http://www.imdb.com/movies-in-theaters/"
20     base_url = "://".join(urlparse.urlsplit(URL)[0:2])
21 apollock 54
def _first_match(root, predicate):
    """Return the first element at or below root satisfying predicate.

    Elements are visited in document order; comments and processing
    instructions are skipped.  Returns None when nothing matches.
    """
    for candidate in root.iter(tag=lxml.etree.Element):
        if predicate(candidate):
            return candidate
    return None


def main():
    """Fetch URL and print an Atom feed wrapping its new-release listing.

    The page's <div id="main"> is located, narrowed to the first element
    whose class is exactly "list detail sub-list" (when present), its
    relative image and hyperlink targets are made absolute, elements
    carrying any of the crufty_classes are removed, and the resulting
    HTML is XML-escaped into a single-entry Atom feed written to stdout.

    Raises:
        RuntimeError: if the page contains no <div id="main">.
    """
    imdb = urllib2.urlopen(URL)
    try:
        doc = lxml.html.fromstring(imdb.read())
    finally:
        # urllib2 responses are not context managers, so close explicitly.
        imdb.close()

    main_div = _first_match(
        doc,
        lambda el: el.tag.endswith("div") and el.attrib.get("id", "") == "main")
    if main_div is None:
        # Previously the search loop fell through and silently used the
        # last element of the document instead.
        raise RuntimeError('no <div id="main"> found in %s' % URL)
    new_releases = copy.deepcopy(main_div)

    # Go deeper to ditch the Box Office Top Ten.  If the sub-list cannot be
    # found, keep the whole main div rather than an arbitrary element.
    sub_list = _first_match(
        new_releases,
        lambda el: el.attrib.get("class", "") == "list detail sub-list")
    if sub_list is not None:
        new_releases = copy.deepcopy(sub_list)

    # Fix relative img links and hyperlinks in a single pass.  urljoin also
    # resolves protocol-relative targets ("//host/path") correctly, which
    # plain concatenation with the site prefix would mangle.
    link_attributes = {"img": "src", "a": "href"}
    for element in new_releases.iter():
        attribute = link_attributes.get(element.tag)
        if attribute is None:
            continue
        target = element.attrib.get(attribute, "")
        if target.startswith("/"):
            element.attrib[attribute] = urlparse.urljoin(URL, target)

    # Remove unnecessary classes
    for crufty_class in crufty_classes:
        for cruft in new_releases.find_class(crufty_class):
            # drop_tree() needs a parent; the fragment root has none.
            if cruft.getparent() is not None:
                cruft.drop_tree()

    movies = xml.sax.saxutils.escape(lxml.html.tostring(new_releases))
    feed_template = """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">

<link href="http://home.andrew.net.au/~apollock/movies.xml" rel="self"/>

<title>This week's movies from IMDb</title>
<updated>%(updated)sZ</updated>
<author>
<name>Andrew Pollock</name>
</author>
<id>http://www.andrew.net.au/</id>

<entry>
<id>http://home.andrew.net.au/movies/%(timestamp)s</id>

<updated>%(updated)sZ</updated>
<title>This week's movies</title>

<content type="html">
%(movies)s
</content>
</entry>
</feed>
"""
    # A single parenthesised argument prints identically to the statement form.
    print(feed_template % {
        # Trim to whole seconds: YYYY-MM-DDTHH:MM:SS
        'updated': datetime.datetime.utcnow().isoformat()[0:19],
        'movies': movies,
        'timestamp': int(time.time()),
    })


if __name__ == "__main__":
    main()

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.22