/[svn.andrew.net.au]/scripts/movies.py
ViewVC logotype

Annotation of /scripts/movies.py

Parent Directory | Revision Log


Revision 53 - (hide annotations)
Mon Mar 19 04:32:36 2012 UTC (10 years, 6 months ago) by apollock
File MIME type: text/x-python
File size: 1627 byte(s)
Do another pass to exclude the weekly top 10

#!/usr/bin/python
2    
import contextlib
import copy
import datetime
import time
import urllib2
import xml.sax.saxutils

import lxml.etree
import lxml.html
9    
# Values passed to find_class() by main(); every element they match is
# dropped from the scraped listing before it is embedded in the feed.
crufty_classes = [
    "rating rating-list",
    "add_to_watchlist",
    "see-more",
    "rating_txt",
    "header",
]
17    
# Atom document printed by main().  Placeholders: ``updated`` (UTC timestamp
# without the trailing "Z"), ``timestamp`` (integer epoch seconds used to make
# the entry id unique) and ``movies`` (XML-escaped HTML fragment).
_FEED_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">

<link href="http://home.andrew.net.au/~apollock/movies.xml" rel="self"/>

<title>This week's movies from IMDb</title>
<updated>%(updated)sZ</updated>
<author>
<name>Andrew Pollock</name>
</author>
<id>http://www.andrew.net.au/</id>

<entry>
<id>http://home.andrew.net.au/movies/%(timestamp)s</id>

<updated>%(updated)sZ</updated>
<title>This week's movies</title>

<content type="html">
%(movies)s
</content>
</entry>
</feed>
"""


def _first_element(root, matches, description):
    """Return the first element at or below *root* for which *matches* is true.

    Elements are visited in the order ``root.iter()`` yields them, starting
    with *root* itself.  Raises ``LookupError`` naming *description* when
    nothing matches, instead of silently handing back an unrelated element.
    """
    # tag=lxml.etree.Element restricts the walk to real elements.
    for element in root.iter(tag=lxml.etree.Element):
        if matches(element):
            return element
    raise LookupError("no %s found in the IMDb page" % description)


def main():
    """Scrape IMDb's in-theaters page and print it as a one-entry Atom feed.

    Steps:
      1. Download http://www.imdb.com/movies-in-theaters/ and parse it.
      2. Take the first ``div`` whose id is "main", then the first element
         inside it whose class attribute is exactly "list detail sub-list"
         (this skips the Box Office Top Ten).
      3. Drop every element matching one of ``crufty_classes``.
      4. XML-escape the remaining HTML and print the Atom document to stdout.

    Raises:
        LookupError: if either container element is missing from the page,
            so a changed page layout fails loudly rather than publishing an
            entry built from an arbitrary element.
    """
    # urllib2 responses are not context managers; closing() guarantees the
    # connection is released even if the read fails.
    with contextlib.closing(
            urllib2.urlopen("http://www.imdb.com/movies-in-theaters/")) as imdb:
        page = imdb.read()
    doc = lxml.html.fromstring(page)

    main_div = _first_element(
        doc,
        lambda el: el.tag.endswith("div") and el.attrib.get("id", "") == "main",
        'div with id "main"',
    )
    # Copy before narrowing/pruning, as the original script did.
    new_releases = copy.deepcopy(main_div)

    # Go deeper to ditch the Box Office Top Ten
    sub_list = _first_element(
        new_releases,
        lambda el: el.attrib.get("class", "") == "list detail sub-list",
        'element with class "list detail sub-list"',
    )
    new_releases = copy.deepcopy(sub_list)

    for crufty_class in crufty_classes:
        for cruft in new_releases.find_class(crufty_class):
            cruft.drop_tree()

    # The HTML goes inside <content type="html">, so it must be escaped.
    movies = xml.sax.saxutils.escape(lxml.html.tostring(new_releases))

    # Single parenthesised argument: prints the same text under the Python 2
    # print statement and also parses as a Python 3 call.
    print(_FEED_TEMPLATE % {
        'updated': datetime.datetime.utcnow().isoformat()[0:19],
        'movies': movies,
        'timestamp': int(time.time()),
    })
59 apollock 6
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.22