safarisync
view safarisync/safarisync/safarisync.py @ 2:23cfad04ce3a
Handle pre-existing directories
| author | Douglas Mayle http://douglas.mayle.org |
|---|---|
| date | Tue Feb 24 17:43:29 2009 -0500 (17 months ago) |
| parents | ad12351ad4dd |
| children | 66b7a589c7e6 |
line source
1 # A more pythonic construct for files
4 ###############################################################################
5 # External Dependencies
6 ###############################################################################
7 # HTML text to DOM library
10 # Library for node selection using CSS
13 ###############################################################################
14 # Standard Library imports
15 ###############################################################################
16 # Handles all of our connection requests
19 # To convert data into a format valid for GET and POST requests
22 # To cleanup our book titles so that they can be used as filenames
25 # Tools for working with files and directories
28 # Module that allows us to prompt for a password without echoing
31 # Some tools for understanding URLs
34 # Standard logging module
37 ###############################################################################
38 # Generic constants
39 ###############################################################################
40 # A regex for selecting out the characters that are invalid and replacing them.
41 # TODO Check out putting Unicode equivalent characters instead...
44 # A mapping between logging strings and logging levels.
51 # The default logging level for this program
54 ###############################################################################
55 # Constants related to Safari
56 ###############################################################################
57 # The list of input values necessary to request pdf generation
69 ###############################################################################
70 # On to the source code
71 ###############################################################################
73 """Monkey patch the standard library modules to keep session cookies."""
74 # If you need to handle cookies in python, you have to monkey patch the
75 # libraries used to fetch files. The most common libraries used for this
76 # purpose are urllib and urllib2 (thankfully, they're consolidated in
77 # Python 3, but we're not there yet...)
80 # If you are having problems with your cookies, it will be useful to set up
81 # an LWP Cookie jar, which allows us to inspect cookies in a human readable
82 # format. Use the following code instead.
83 #
84 # from cookielib import LWPCookieJar
85 # global cj
86 # cj = LWPCookieJar()
87 #
88 # opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
93 # We depend on urllib2 to perform our patching, so we know it's here
98 # If the program has urllib loaded, we'll patch that, as well.
103 # Ideally, we'd also like to patch lxml so that we can use its built-in
104 # facilities with cookies, as well, but lxml sometimes uses urllib and
105 # sometimes uses libxml's web facilities, which we can't patch. You have
106 # to be aware of this and work around the limitations.
109 """Login to the Safari website to load a session cookie."""
123 # For the purpose of this script, we assume success, so we don't care about
124 # the result. We really should verify this, though.
128 # lxml uses urllib by default for downloads. Since we've patched it for
129 # cookie support, this is sufficient for our needs.
133 """Return the safari download metadata by retrieving the downloads page and
134 parsing it."""
135 # Read and parse downloads
140 # Get the list of table headers
141 headers = [header.text_content().strip().lower() for header in doc.cssselect("table.Content th")]
143 # In order to be a bit more resilient to changes in the document, we'll try
144 # to find the information we care about in the table.
150 # We store a one based index for css selection
154 logging.critical("Unable to find download metadata for these categories: '%s' from headers:\n%s" % \
159 ###################################################
160 # Some helper functions for extracting cell content
161 ###################################################
163 "Get the href of the first a node, or return an empty string"
165 # This dance cleans up some technically valid, but useless links like '#'
170 "Return the text content of the cell, strip leading and trailing whitespace."
173 # Because of a bug either in lxml, or the libraries it depends on
174 # (libxml, libxslt), it reads the utf-8 document and treats it as
175 # if it were latin-1. We fix the encoding mistake.
179 # This probably means that the text was properly decoded and it
180 # contains characters not valid in the latin-1 set. We'll let this
181 # pass.
184 # Just in case this bug exists only on my system, we'll ignore it
185 # if the 'fix' doesn't work.
188 # We'll keep track of whether or not we requested pdfs so that we can print
189 # a helpful error message
192 # Extract a list of table row elements, each one containing data about one
193 # download
202 continue
204 # Safari book downloads don't have a section text.
206 progress_message = "Handling Section '%s' of Book '%s'" % (get_text(sectioncell), get_text(titlecell))
228 "requested from Safari, so please rerun this after generation is " \
229 "complete."
232 """Download the file from the given link, and save it to the specified filepath"""
237 # The error is raised even if the directory already exists. If so, we
238 # ignore the error.
241 return
248 """Turn a list of path elements into a path, while sanitizing the characters"""
252 """Submit a PDF generation request. This is now an AJAX only interface, so
253 we hack it instead of connecting to a web page to fill out the form."""
265 "Request a user and password, taking into account data from the command line."
291 help='Change the logging level of this application. Possible choices are "%s".' % ', '.join(LOGLEVELS.keys()))
292 # This normally won't be used unless someone is debugging the html
293 # scraping. In that case, it saves the effort of supplying the user and
294 # password and connecting to the server.
302 # We only allow a set list of log levels. If the one supplied is bogus,
303 # use the default, but notify the users in case that means we've munged
304 # some other parameter.
306 logging.error("Invalid log level '%s', defaulting to '%s'" % (options.loglevel, DEFAULT_LOGGING))
319 # Set up urllib2 to keep cookies.
