/[svn.andrew.net.au]/scripts/movies.py
ViewVC logotype

Annotation of /scripts/movies.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 51 - (hide annotations)
Fri Jan 6 05:52:07 2012 UTC (10 years, 8 months ago) by apollock
File MIME type: text/x-python
File size: 1387 byte(s)
Remove unnecessary HTML to get output below 100K

1 apollock 6 #!/usr/bin/python
2    
3 apollock 50 import copy
4 apollock 49 import urllib2
5 apollock 6 import time
6     import datetime
7 apollock 50 import lxml.html
8 apollock 6 import xml.sax.saxutils
9    
# Class names handed to find_class() in main(); every matching element is
# dropped from the copied page fragment before it is embedded in the feed,
# which keeps the generated output small.
crufty_classes = [
    "rating rating-list",
    "add_to_watchlist",
    "see-more",
    "rating_txt",
]
16    
def main():
    """Print an Atom feed whose single entry embeds IMDb's in-theaters page.

    Fetches http://www.imdb.com/movies-in-theaters/, deep-copies the
    ``<div id="main">`` subtree, drops every element matched by a class name
    in ``crufty_classes``, and writes the XML-escaped HTML of what remains to
    standard output, wrapped in an Atom document.

    Raises:
        LookupError: if the fetched page contains no ``<div id="main">``.
    """
    imdb = urllib2.urlopen("http://www.imdb.com/movies-in-theaters/")
    try:
        page = imdb.read()
    finally:
        # Release the HTTP connection even if reading the body fails.
        imdb.close()
    doc = lxml.html.fromstring(page)

    # Restricting iteration to Element nodes skips comments and processing
    # instructions, whose .tag is not a string.
    for element in doc.iter(tag=lxml.etree.Element):
        is_div = element.tag.endswith("div")
        if is_div and element.attrib.get("id", "") == "main":
            break
    else:
        # Without this branch ``element`` would stay bound to the last node
        # visited and an unrelated fragment would be published as the feed.
        raise LookupError('no <div id="main"> element found in the IMDb page')

    # Prune a copy so the parsed document itself is left untouched.
    new_releases = copy.deepcopy(element)
    for crufty_class in crufty_classes:
        for cruft in new_releases.find_class(crufty_class):
            cruft.drop_tree()

    # The Atom <content type="html"> payload must be escaped markup.
    movies = xml.sax.saxutils.escape(lxml.html.tostring(new_releases))

    # Parenthesised single-argument form prints the same text as the bare
    # Python 2 print statement.
    print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">

<link href="http://home.andrew.net.au/~apollock/movies.xml" rel="self"/>

<title>This week's movies from IMDb</title>
<updated>%(updated)sZ</updated>
<author>
<name>Andrew Pollock</name>
</author>
<id>http://www.andrew.net.au/</id>

<entry>
<id>http://home.andrew.net.au/movies/%(timestamp)s</id>

<updated>%(updated)sZ</updated>
<title>This week's movies</title>

<content type="html">
%(movies)s
</content>
</entry>
</feed>
""" % {
        # isoformat() may carry microseconds; keep only YYYY-MM-DDTHH:MM:SS.
        'updated': datetime.datetime.utcnow().isoformat()[0:19],
        'movies': movies,
        'timestamp': int(time.time()),
    })
53 apollock 6
# Run the feed generator only when executed as a script, not on import.
if __name__ == "__main__":
    main()

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.22