safarisync

view safarisync/safarisync.py @ 14:811bb2e2ed2f

Fix the logging type on a debug message. Disable the use of urlretrieve because of a problem on python 2.6 and windows. Turn on binary file mode, since it's needed on windows.
author Douglas Mayle http://douglas.mayle.org
date Fri Feb 27 21:27:49 2009 +0000 (12 months ago)
parents 17f2ae3f60d8
children
line source
1 #!/usr/bin/env python
3 # HTML text to DOM library
4 from lxml import html
6 # Net and url based tools
7 import urllib2, urllib
8 from urlparse import urlparse, urlunparse
10 # To cleanup our book titles so that they can be used as filenames
11 from re import compile
13 # Tools for working with files and directories
14 from os import makedirs, path, getcwd
16 # Module that allows us to prompt for a password without echoing
17 from getpass import getpass
19 # Standard logging module
20 import logging
# Matches every character treated as invalid in a file name; matches are
# replaced when book and section titles are turned into local paths.
# TODO Check out putting Unicode equivalent characters instead...
INVALID_FILE_CHARS = compile(r'[?%*:|"<>/]')

# Log level names accepted on the command line, mapped to logging constants.
LOGLEVELS = dict(
    debug=logging.DEBUG,
    info=logging.INFO,
    warning=logging.WARNING,
    error=logging.ERROR,
    critical=logging.CRITICAL
)

# The log level name used when the user does not pick a valid one.
DEFAULT_LOGGING = 'info'

# Template of the fields sent to request PDF generation. '__dlid' and
# '__pdfcurrentxmlid' are left empty here and filled in per request.
SAFARI_REQUESTPDF_FORM = {
    '__className': 'pdfdownload',
    '__dlid': '',
    '__pdfcurrentxmlid': '',
    '__callOmniture': '1',
    '__version': '1.1.1',
    '__pdfaction': 'regenerate',
}

# Safari endpoints used by this script.
URL_SAFARI_LOGIN = 'http://my.safaribooksonline.com/login'
URL_SAFARI_DOWNLOADS = 'http://safari.oreilly.com/mydownloads'
URL_SAFARI_REQUESTPDF = 'http://safari.oreilly.com/_ajax_overlaypdf'
def config_cookie_support():
    """Install a cookie-keeping opener into urllib2 (and urllib, if loaded).

    Handling cookies in Python 2 means monkey patching the libraries used to
    fetch files. The usual candidates are urllib and urllib2 (consolidated in
    Python 3, but this script is not there yet). After this call, requests
    made through the patched modules share one set of session cookies.
    """
    import sys
    import urllib2

    logging.debug('Monkey patching system libraries with cookie support.')

    # Debugging tip: when cookies misbehave, an LWP cookie jar lets you
    # inspect them in a human readable format. Build the opener like this
    # instead:
    #
    #   from cookielib import LWPCookieJar
    #   global cj
    #   cj = LWPCookieJar()
    #
    #   opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    cookie_opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())

    # urllib2 was imported just above to do the patching, so it is always
    # available to receive the opener.
    urllib2.install_opener(cookie_opener)

    # Ideally lxml would be patched too, so its built-in fetching could use
    # the cookies. However lxml sometimes uses urllib and sometimes uses
    # libxml's own web facilities, which can't be patched. Be aware of this
    # and work around the limitation.

    # Only touch urllib when the program has already loaded it.
    if 'urllib' not in sys.modules:
        return
    logging.debug('urllib loaded, monkey patching it for cookie support')
    sys.modules['urllib']._urlopener = cookie_opener
def safari_login(user, password):
    """Submit the Safari login form so the session cookie gets stored.

    Fetches the login page, fills the form named "login" with `user` and
    `password`, and submits it. The response is not inspected.

    Raises:
        EOFError: if the login page contains no form named "login".
    """
    logging.debug("Connecting to the Safari website at %s." % URL_SAFARI_LOGIN)
    page_source = urllib2.urlopen(URL_SAFARI_LOGIN).read()
    page = html.fromstring(page_source, base_url=URL_SAFARI_LOGIN)

    candidates = page.cssselect('form[name="login"]')
    if len(candidates) == 0:
        logging.critical("Unable to find the login form, can't continue.")
        raise EOFError

    form = candidates[0]
    for field_name, value in (('login', user), ('password', password)):
        form.fields[field_name] = value

    # Success is assumed here, so the result of the submission is ignored.
    # It really should be verified, though.
    logging.info("Logging into the account")
    logging.debug('Submitting login form')

    # lxml downloads through urllib by default; since urllib has been patched
    # for cookie support, that is sufficient for our needs.
    html.submit_form(form)
def safari_get_downloads(filename=None, syncpath=None):
    """Sync the PDFs listed on the Safari downloads page to the local disk.

    The downloads table is parsed, and for every row whose PDF is not yet on
    disk the file is either downloaded (when the row has a download link) or
    its generation is requested from Safari (when it has none).

    Args:
        filename: URL or local path of the downloads page. Defaults to
            URL_SAFARI_DOWNLOADS.
        syncpath: Directory under which '<book>/<section>.pdf' files are
            stored. Defaults to the working directory at call time. It is
            used as given; only the book and section names are sanitized.

    Raises:
        EOFError: if the 'book', 'section' and 'pdf' columns cannot all be
            located in the table headers.
    """
    if filename is None:
        filename = URL_SAFARI_DOWNLOADS
    if syncpath is None:
        # Resolved per call rather than once at import time.
        syncpath = getcwd()

    # Read and parse downloads. urllib (rather than urllib2) is used because
    # its urlopen also accepts a plain local path, which is what the
    # simulation mode hands us.
    logging.info("Retrieving Safari downloads page from %s", filename)
    page = urllib.urlopen(filename)
    try:
        doc = html.fromstring(page.read(), base_url=URL_SAFARI_DOWNLOADS)
    finally:
        page.close()

    # Normalized text of every table header, in document order.
    headers = [th.text_content().strip().lower()
               for th in doc.cssselect("table.Content th")]

    # In order to be a bit more resilient to changes in the document, we look
    # the columns we care about up by header text instead of by position.
    metadata = ['book', 'section', 'pdf']
    columns = {}
    for position, header in enumerate(headers):
        for column in metadata:
            if column in header:
                # We store a one based index for css selection. When several
                # headers match, the last one wins.
                columns[column] = position + 1

    missing = [column for column in metadata if column not in columns]
    if missing:
        # `headers` holds plain strings, so they are joined directly.
        logging.critical(
            "Unable to find download metadata for these categories: '%s' from headers:\n%s",
            "','".join(missing), "\n".join(headers))
        raise EOFError

    ###################################################
    # Some helper functions for extracting cell content
    ###################################################
    def get_link(cell):
        """Return the href of the cell's first <a>, or '' if it has none."""
        anchors = cell.cssselect('a')
        href = anchors[0].get('href', '') if anchors else ''
        # This dance cleans up some technically valid, but useless links
        # like '#'.
        return urlunparse(urlparse(href))

    def get_text(cell):
        """Return the cell's text content without surrounding whitespace."""
        text = cell.text_content().strip()

        # Because of a bug either in lxml, or the libraries it depends on
        # (libxml, libxslt), it reads the utf-8 document and treats it as if
        # it were latin-1. We fix the encoding mistake.
        try:
            return text.encode('latin-1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # An encode error probably means the text was decoded properly
            # and holds characters outside latin-1. A decode error means the
            # 'fix' doesn't apply on this system. Either way the text is
            # used unchanged.
            return text

    # We keep track of whether or not we requested pdfs so that we can print
    # a helpful message at the end.
    requested_pdfs = False

    # Extract the list of table row elements, each one containing data about
    # one download.
    rows = doc.cssselect("table.Content tbody tr")
    total = len(rows)
    for index, row in enumerate(rows):
        try:
            titlecell, sectioncell, downloadcell = [
                row.cssselect('td:nth-child(%d)' % columns[column])[0]
                for column in metadata]
        except IndexError:
            logging.error("Unable to extract download data from cell:\n%s",
                          html.tostring(row))
            continue

        title = get_text(titlecell)
        section = get_text(sectioncell)

        # Safari book downloads don't have a section text; such a book is
        # stored as '<title>/<title>.pdf'.
        if section:
            progress_message = "Handling Section '%s' of Book '%s'" % (section, title)
            leaf = section
            requestid = get_link(sectioncell).lstrip('/')
        else:
            progress_message = "Handling Book %s" % title
            leaf = title
            requestid = get_link(titlecell).lstrip('/')

        # Only the names taken from the page are sanitized. Sanitizing the
        # sync root as well would mangle its path separators.
        pdffile = path.join(syncpath,
                            '%s.pdf' % get_sanitized_path([title, leaf]))

        if path.exists(pdffile):
            logging.debug("%d of %d:%s", index + 1, total, progress_message)
            continue

        logging.info("%d of %d:%s", index + 1, total, progress_message)
        download_link = get_link(downloadcell)
        if download_link:
            logging.debug("Downloading file from %s", download_link)
            downloadfile(download_link, pdffile)
        else:
            logging.debug("Requesting PDF generation for ID %s", requestid)
            # The same id is sent as both the download id and the xml id.
            requestpdf(requestid, requestid)
            requested_pdfs = True

    if requested_pdfs:
        logging.info("Not all of the missing PDFs were found. They have been " \
                     "requested from Safari, so please rerun this after generation is " \
                     "complete.")
def downloadfile(link, filepath):
    """Download `link` and save it to `filepath` in binary mode.

    Missing parent directories of `filepath` are created first. If they
    cannot be created, an error is logged and nothing is downloaded. The
    body is streamed to disk instead of being held in memory, and a file
    left incomplete by a failed transfer is removed before the error
    propagates.

    Args:
        link: URL to fetch through urllib2.
        filepath: Destination path of the downloaded file.
    """
    from contextlib import closing
    from os import remove
    from shutil import copyfileobj

    filedir = path.dirname(filepath)
    # A bare file name has no directory part, and makedirs('') always fails,
    # so there is nothing to create in that case.
    if filedir:
        try:
            makedirs(filedir)
        except OSError:
            # The error is raised even if the directory already exists. If
            # so, we ignore the error.
            if not path.exists(filedir):
                logging.error('Unable to create directory: %s', filedir)
                return

    # urllib.urlretrieve is deliberately not used here; it was disabled
    # because of a problem on Python 2.6 and Windows.
    with closing(urllib2.urlopen(link)) as response:
        # Binary mode is needed on Windows.
        with open(filepath, 'wb') as pdf:
            try:
                copyfileobj(response, pdf)
            except BaseException:
                # Don't leave a truncated file behind; anything that checks
                # for the file's existence would take it for a finished
                # download. Close first: Windows can't remove an open file.
                pdf.close()
                remove(filepath)
                raise
def get_sanitized_path(pathlist):
    """Join path elements into a single path, sanitizing each element.

    Every character matched by INVALID_FILE_CHARS is replaced with '_' in
    each element of `pathlist` before the elements are joined.
    """
    cleaned = []
    for element in pathlist:
        cleaned.append(INVALID_FILE_CHARS.sub('_', element))
    return path.join(*cleaned)
def requestpdf(downloadid, xmlid):
    """Submit a PDF generation request to Safari.

    This is now an AJAX only interface, so we hack it instead of connecting
    to a web page to fill out the form: the form fields are url-encoded and
    sent as the query string of a GET request to URL_SAFARI_REQUESTPDF.

    Args:
        downloadid: Value sent as the '__dlid' field.
        xmlid: Value sent as the '__pdfcurrentxmlid' field.
    """
    # Work on a copy so the shared template keeps its empty placeholders.
    form_values = SAFARI_REQUESTPDF_FORM.copy()
    form_values['__dlid'] = downloadid
    form_values['__pdfcurrentxmlid'] = xmlid

    query = urllib.urlencode(form_values)

    response = urllib2.urlopen("%s?%s" % (URL_SAFARI_REQUESTPDF, query))
    try:
        # The reply is read in full and discarded.
        response.read()
    finally:
        response.close()
def prompt_user_pass(user, password):
    """Return (user, password), prompting for whichever one is missing.

    Values already supplied (e.g. on the command line) are kept as they are.
    The password prompt does not echo what is typed.
    """
    user = user or raw_input("Please input the username for your Safari account.\n")
    password = password or getpass("Please input the password for your Safari account.\n")
    return user, password
def main():
    """Parse the command line, log into Safari and sync the downloads.

    Options: -u/--username, -p/--password, -d/--dest (defaults to 'books'),
    -l/--logging (one of the LOGLEVELS names) and -s/--simulate (a local
    copy of the downloads page to parse instead of the live one).
    """
    from optparse import OptionParser
    parser = OptionParser()

    parser.add_option('-u', '--username',
                      dest='username',
                      default=None,
                      help='Username of the Safari account to sync')
    parser.add_option('-p', '--password',
                      dest='password',
                      default=None,
                      help='Password of the Safari account to sync')
    parser.add_option('-d', '--dest',
                      dest='path',
                      default='books',
                      help='Path of the folder to sync downloads to. This defaults to books subdirectory.')
    parser.add_option('-l', '--logging',
                      dest='loglevel',
                      default=DEFAULT_LOGGING,
                      help='Change the logging level of this application. Possible choices are "%s".'
                           % ', '.join(sorted(LOGLEVELS, key=LOGLEVELS.get)))
    # This normally won't be used unless someone is debugging the html
    # scraping. In that case, it saves the effort of supplying the user and
    # password and connecting to the server.
    parser.add_option('-s', '--simulate',
                      dest='simulate',
                      default=None,
                      help='Provide a local copy of the downloads page for simulation.')

    options = parser.parse_args()[0]

    # We only allow a set list of log levels. If the one supplied is bogus,
    # use the default, but notify the users in case that means we've munged
    # some other parameter.
    #
    # Logging is configured BEFORE the complaint is emitted: logging.error()
    # on an unconfigured root logger sets up a default handler itself, after
    # which basicConfig() would silently do nothing.
    valid_level = options.loglevel in LOGLEVELS
    level_name = options.loglevel if valid_level else DEFAULT_LOGGING
    logging.basicConfig(level=LOGLEVELS[level_name],
                        format="%(levelname)-8s %(message)s")
    if not valid_level:
        logging.error("Invalid log level '%s', defaulting to '%s'",
                      options.loglevel, DEFAULT_LOGGING)

    if options.simulate:
        filename = path.realpath(path.join(getcwd(), options.simulate))
    else:
        filename = URL_SAFARI_DOWNLOADS

    # Set up urllib2 to keep cookies.
    config_cookie_support()

    if options.simulate and not (options.username and options.password):
        # Simulation exists to avoid supplying credentials and connecting to
        # the server, so neither prompt nor log in unless both credentials
        # were given explicitly on the command line.
        logging.info("Simulation mode without credentials, skipping the Safari login")
    else:
        if not options.username or not options.password:
            options.username, options.password = prompt_user_pass(options.username, options.password)
        safari_login(options.username, options.password)

    safari_get_downloads(filename, options.path)


if __name__ == "__main__":
    main()