safarisync

view safarisync/safarisync/safarisync.py @ 2:23cfad04ce3a

Handle pre-existing directories
author Douglas Mayle http://douglas.mayle.org
date Tue Feb 24 17:43:29 2009 -0500 (17 months ago)
parents ad12351ad4dd
children 66b7a589c7e6
line source
1 # A more pythonic construct for files
2 from __future__ import with_statement
4 ###############################################################################
5 # External Dependencies
6 ###############################################################################
7 # HTML text to DOM library
8 from lxml import html
10 # Library for node selection using CSS
11 from lxml import cssselect
13 ###############################################################################
14 # Standard Library imports
15 ###############################################################################
16 # Handles all of our connection requests
17 import urllib2, urllib
19 # To convert data into a format valid for GET and POST requests
20 from urllib import urlencode
22 # To cleanup our book titles so that they can be used as filenames
23 from re import sub
25 # Tools for working with files and directories
26 from os import makedirs, path, getcwd
28 # Module that allows us to prompt for a password without echoing
29 from getpass import getpass
31 # Some tools for understanding URLs
32 from urlparse import urlparse, urlunparse
34 # Standard logging module
35 import logging
37 ###############################################################################
38 # Generic constants
39 ###############################################################################
# Regex character class matching the characters treated as invalid in file
# names; each match gets replaced when a path is sanitized.
# TODO Checkout putting Unicode equivalent characters instead...
INVALID_FILE_CHARS = r'[?%*:|"<>/]'

# Level names accepted on the command line, mapped to the numeric levels of
# the logging module.
LOGLEVELS = {
    'debug': logging.DEBUG,
    'info': logging.INFO,
    'warning': logging.WARNING,
    'error': logging.ERROR,
    'critical': logging.CRITICAL,
}

# Key into LOGLEVELS that is used when no valid level name was supplied.
DEFAULT_LOGGING = 'error'
54 ###############################################################################
55 # Constants related to Safari
56 ###############################################################################
# Template of the form fields sent when requesting PDF generation.  The
# '__dlid' and '__pdfcurrentxmlid' entries are blank here and are filled in
# for each individual request.
SAFARI_REQUESTPDF_FORM = {
    '__className': 'pdfdownload',
    '__dlid': '',
    '__pdfcurrentxmlid': '',
    '__callOmniture': '1',
    '__version': '1.1.1',
    '__pdfaction': 'regenerate',
}

# Safari addresses used by this script: login page, downloads listing and
# the PDF generation request endpoint.
URL_SAFARI_LOGIN = 'http://my.safaribooksonline.com/login'
URL_SAFARI_DOWNLOADS = 'http://safari.oreilly.com/mydownloads'
URL_SAFARI_REQUESTPDF = 'http://safari.oreilly.com/_ajax_overlaypdf'
69 ###############################################################################
70 # On to the source code
71 ###############################################################################
def config_cookie_support():
    """Install a cookie-keeping opener into urllib2 and, if loaded, urllib."""
    # Session cookies are only kept if the fetching libraries are monkey
    # patched.  In Python 2 that means urllib2 and urllib (they are merged
    # in Python 3, but this script does not target it yet).
    logging.info('Monkey patching system libraries with cookie support.')

    # Debugging hint: to inspect the cookies in a human readable format,
    # build the opener around an LWP cookie jar instead:
    #
    #     from cookielib import LWPCookieJar
    #     global cj
    #     cj = LWPCookieJar()
    #     cookie_opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

    import sys
    import urllib2

    cookie_opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())

    # urllib2 was just imported for the patching, so it is certainly there.
    urllib2.install_opener(cookie_opener)

    # urllib is only patched when the program has already loaded it.
    if 'urllib' in sys.modules:
        logging.debug('urllib loaded, monkey patching it for cookie support')
        sys.modules['urllib']._urlopener = cookie_opener

    # lxml is deliberately left alone: it sometimes fetches through urllib
    # and sometimes through libxml's own web facilities, and the latter
    # cannot be patched.  Callers have to work around that limitation.
def safari_login(user, password):
    """Submit the Safari login form so that a session cookie is obtained.

    The login page is fetched, the form named "login" is filled with the
    given user and password, and the form is submitted.

    Raises EOFError when the page does not contain such a form.
    """
    logging.info("Logging in to the Safari website at %s." % URL_SAFARI_LOGIN)
    page_source = urllib2.urlopen(URL_SAFARI_LOGIN).read()
    page = html.fromstring(page_source, base_url=URL_SAFARI_LOGIN)

    candidates = page.cssselect('form[name="login"]')
    if not candidates:
        logging.critical("Unable to find the login form, can't continue.")
        raise EOFError

    form = candidates[0]
    form.fields['login'] = user
    form.fields['password'] = password

    # The outcome of the submission is not inspected; success is simply
    # assumed.  It really should be verified.
    print("Logging into the account")
    logging.debug('Submitting login form')

    # lxml submits through urllib by default, and urllib has been patched
    # for cookie support, so nothing more is needed here.
    html.submit_form(form)
def safari_get_downloads(filename=None, syncpath=None):
    """Sync the PDFs listed on the Safari downloads page into a local folder.

    The page is fetched and its ``table.Content`` table is parsed.  For every
    row whose PDF is not on disk yet, the PDF is downloaded when the row
    offers a download link; otherwise its generation is requested.

    filename -- URL or local path of the downloads page.  Defaults to
                URL_SAFARI_DOWNLOADS.
    syncpath -- directory the PDFs are stored under.  Defaults to the
                working directory at the time of the call.

    Returns nothing.  Raises EOFError when the 'book', 'section' and 'pdf'
    columns cannot all be located in the table headers.
    """
    if filename is None:
        filename = URL_SAFARI_DOWNLOADS
    if syncpath is None:
        # Resolved here rather than in the signature, where getcwd() would
        # be evaluated once at import time.
        syncpath = getcwd()

    print("Getting the list of downloads")
    logging.info("Retrieving Safari downloads page from %s" % filename)
    # urllib.urlopen (not urllib2) is kept because it is also handed a plain
    # local path when the page is simulated from a saved copy.
    page = urllib.urlopen(filename)
    try:
        doc = html.fromstring(page.read(), base_url=URL_SAFARI_DOWNLOADS)
    finally:
        page.close()

    # Lower-cased text of every table header, in document order.
    headers = [th.text_content().strip().lower()
               for th in doc.cssselect("table.Content th")]

    # To be a bit more resilient to changes in the document, the columns are
    # located by searching the header texts instead of using fixed positions.
    wanted = ('book', 'section', 'pdf')
    columns = {}
    for name in wanted:
        for position, header in enumerate(headers):
            if name in header:
                # Stored one based, as required by the nth-child() selector.
                columns[name] = position + 1

    missing = [name for name in wanted if name not in columns]
    if missing:
        # headers holds plain strings, so they are joined directly.
        logging.critical(
            "Unable to find download metadata for these categories: '%s' from headers:\n%s"
            % ("','".join(missing), "\n".join(headers)))
        raise EOFError

    ###################################################
    # Some helper functions for extracting cell content
    ###################################################
    def get_link(cell):
        "Get the href of the first a node, or return an empty string"
        anchors = cell.cssselect('a')
        link = anchors[0].attrib.get('href', '') if anchors else ''
        # This dance cleans up some technically valid, but useless links
        # like '#'.
        return urlunparse(urlparse(link))

    def get_text(cell):
        "Return the text content of the cell, stripped of surrounding whitespace."
        text = cell.text_content().strip()

        # Because of a bug either in lxml or in the libraries it depends on
        # (libxml, libxslt), the utf-8 document is read as if it were
        # latin-1.  Undo that encoding mistake where possible.
        try:
            return text.encode('latin-1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Either the text was decoded properly and holds characters
            # outside latin-1, or the 'fix' does not apply on this system.
            # In both cases the text is used as it is.
            return text

    # Remember whether any PDF generation was requested so that a helpful
    # message can be printed at the end.
    requested_pdfs = False

    # Each table row describes one download.
    rows = doc.cssselect("table.Content tbody tr")
    total = len(rows)
    for index, row in enumerate(rows):
        try:
            titlecell = row.cssselect('td:nth-child(%d)' % columns['book'])[0]
            sectioncell = row.cssselect('td:nth-child(%d)' % columns['section'])[0]
            downloadcell = row.cssselect('td:nth-child(%d)' % columns['pdf'])[0]
        except IndexError:
            logging.error("Unable to extract download data from cell:\n%s" % html.tostring(row))
            continue

        title = get_text(titlecell)
        section = get_text(sectioncell)

        # Whole-book downloads don't have a section text; their file is
        # named after the book itself.
        if section:
            progress_message = "Handling Section '%s' of Book '%s'" % (section, title)
            leaf = section
            requestid = get_link(sectioncell)
        else:
            progress_message = "Handling Book %s" % title
            leaf = title
            requestid = get_link(titlecell)

        # Only the scraped names are sanitized.  Sanitizing syncpath as well
        # would replace the directory separators of the destination folder.
        pdffile = '%s.pdf' % path.join(syncpath, get_sanitized_path([title, leaf]))

        if path.exists(pdffile):
            logging.debug("%d of %d:%s" % (index + 1, total, progress_message))
            continue

        logging.info("%d of %d:%s" % (index + 1, total, progress_message))
        download_link = get_link(downloadcell)
        if download_link:
            logging.debug("Downloading file from %s" % download_link)
            downloadfile(download_link, pdffile)
        else:
            logging.debug("Requesting PDF generation for ID %s" % requestid)
            requestpdf(requestid, requestid)
            requested_pdfs = True

    if requested_pdfs:
        print("Not all of the missing PDFs were found. They have been "
              "requested from Safari, so please rerun this after generation is "
              "complete.")
def downloadfile(link, filepath):
    """Download the file from the given link and save it to filepath.

    Missing parent directories of filepath are created first.  When they
    cannot be created, an error is logged and nothing is downloaded.
    Errors raised while fetching or writing are propagated to the caller
    after a partially written file has been removed.
    """
    from contextlib import closing
    from os import remove
    from shutil import copyfileobj

    filedir = path.dirname(filepath)
    # A bare file name has no directory part, and makedirs('') always fails.
    if filedir:
        try:
            makedirs(filedir)
        except OSError:
            # The error is raised even if the directory already exists.  If
            # so, it is ignored.
            if not path.isdir(filedir):
                logging.error('Unable to create directory: %s' % filedir)
                return

    with closing(urllib2.urlopen(link)) as response:
        try:
            # PDFs are binary data; text mode would corrupt them on
            # platforms that translate line endings.
            with open(filepath, 'wb') as pdf:
                copyfileobj(response, pdf)
        except BaseException:
            # The sync skips files that already exist, so a truncated file
            # left behind here would never be fetched again.
            if path.exists(filepath):
                remove(filepath)
            raise
def get_sanitized_path(pathlist):
    """Join the elements of pathlist into a single path.

    Before joining, every character of an element that matches
    INVALID_FILE_CHARS is replaced by an underscore.
    """
    cleaned = []
    for element in pathlist:
        cleaned.append(sub(INVALID_FILE_CHARS, '_', element))
    return path.join(*cleaned)
def requestpdf(downloadid, xmlid):
    """Submit a PDF generation request to Safari.

    The interface is AJAX only, so the request is put together by hand
    instead of loading a web page and filling out its form.  downloadid and
    xmlid are sent as the '__dlid' and '__pdfcurrentxmlid' fields.
    """
    fields = dict(SAFARI_REQUESTPDF_FORM)
    fields.update(__dlid=downloadid, __pdfcurrentxmlid=xmlid)

    # The encoded fields travel in the query string of the request URL.
    query = urlencode(fields)
    request = urllib2.Request(URL_SAFARI_REQUESTPDF + '?' + query)

    # The answer is read completely, but its content is not used.
    urllib2.urlopen(request).read()
def prompt_user_pass(user, password):
    """Return a (user, password) pair, prompting for whichever part is empty.

    Values that were already supplied (for example on the command line) are
    returned unchanged.  The password prompt does not echo the input.
    """
    user = user or raw_input("Please input the username for your Safari account.\n")
    password = password or getpass("Please input the password for your Safari account.\n")
    return user, password
def main():
    """Command line entry point: parse the options, log in and sync.

    Options:
      -u/--username  Safari account name (prompted for when missing)
      -p/--password  Safari password (prompted for when missing)
      -d/--dest      folder to sync the downloads to (default 'books')
      -l/--logging   one of the LOGLEVELS names (default DEFAULT_LOGGING)
      -s/--simulate  local copy of the downloads page to parse instead of
                     the live page
    """
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option('-u', '--username',
                      dest='username',
                      default=None,
                      help='Username of the Safari account to sync')
    parser.add_option('-p', '--password',
                      dest='password',
                      default=None,
                      help='Password of the Safari account to sync')
    parser.add_option('-d', '--dest',
                      dest='path',
                      default='books',
                      help='Path of the folder to sync downloads to. This defaults to books subdirectory.')
    # The choices are listed from least to most severe instead of in
    # arbitrary dictionary order.
    parser.add_option('-l', '--logging',
                      dest='loglevel',
                      default=DEFAULT_LOGGING,
                      help='Change the logging level of this application. Possible choices are "%s".'
                           % ', '.join(sorted(LOGLEVELS, key=LOGLEVELS.get)))
    # This normally won't be used unless someone is debugging the html
    # scraping.
    parser.add_option('-s', '--simulate',
                      dest='simulate',
                      default=None,
                      help='Provide a local copy of the downloads page for simulation.')

    options, _ = parser.parse_args()

    # Only a fixed set of log levels is allowed (matched case-insensitively).
    # If the one supplied is bogus, the default is used, and the user is
    # told in case some other parameter was munged.
    loglevel = options.loglevel.lower()
    if loglevel in LOGLEVELS:
        logging.basicConfig(level=LOGLEVELS[loglevel])
    else:
        # basicConfig() has to come first: logging a record on an
        # unconfigured root logger installs a default handler, after which
        # basicConfig() does nothing and the level would never be applied.
        logging.basicConfig(level=LOGLEVELS[DEFAULT_LOGGING])
        logging.error("Invalid log level '%s', defaulting to '%s'" % (options.loglevel, DEFAULT_LOGGING))

    if options.simulate:
        filename = path.realpath(path.join(getcwd(), options.simulate))
    else:
        filename = URL_SAFARI_DOWNLOADS

    # NOTE(review): the login below also runs with --simulate.  Skipping it
    # would avoid the prompt, but PDFs missing on disk are still fetched
    # from the server afterwards -- confirm the intended behaviour before
    # changing this.
    # prompt_user_pass only asks for the values that are still missing.
    options.username, options.password = prompt_user_pass(options.username, options.password)

    # Set up urllib2 to keep cookies.
    config_cookie_support()
    safari_login(options.username, options.password)

    safari_get_downloads(filename, options.path)
# Start the sync only when this file is executed as a script.
if __name__ == '__main__':
    main()