#! /usr/bin/python

# for handling the api via http
import urllib, urllib2, base64, httplib
# for xml parsing
from xml.dom.ext.reader import Sax2
# other useful system libs
import getopt
import string
import sys
import time

# globals, you can set the values here if you import deli.py,
# and override on the command-line if you run it
bookmarks = "bookmarks.html"
username = "username"
password = "password"
# this will assign the specified tag(s) to every url imported
# by the script, may be handy later
baseTag = "ExperimentalImports"
# update this if the api url changes
baseUrl = "http://del.icio.us/api/"
# debug
debug = False

# urllib2 handling code from the great tutorial at:
# http://www.voidspace.org.uk/python/articles/urllib2.shtml
# Table mapping response codes to messages; entries have the
# form {code: (shortmessage, longmessage)}.
httpresponses = {
    100: ('Continue', 'Request received, please continue'),
    101: ('Switching Protocols',
          'Switching to new protocol; obey Upgrade header'),
    200: ('OK', 'Request fulfilled, document follows'),
    201: ('Created', 'Document created, URL follows'),
    202: ('Accepted', 'Request accepted, processing continues off-line'),
    203: ('Non-Authoritative Information', 'Request fulfilled from cache'),
    204: ('No response', 'Request fulfilled, nothing follows'),
    205: ('Reset Content', 'Clear input form for further input.'),
    206: ('Partial Content', 'Partial content follows.'),
    300: ('Multiple Choices', 'Object has several resources -- see URI list'),
    301: ('Moved Permanently', 'Object moved permanently -- see URI list'),
    302: ('Found', 'Object moved temporarily -- see URI list'),
    303: ('See Other', 'Object moved -- see Method and URL list'),
    304: ('Not modified', 'Document has not changed since given time'),
    305: ('Use Proxy', 'You must use proxy specified in Location'
          ' to access this resource.'),
    307: ('Temporary Redirect', 'Object moved temporarily -- see URI list'),
    400: ('Bad request', 'Bad request syntax or unsupported method'),
    401: ('Unauthorized', 'No permission -- see authorization schemes'),
    402: ('Payment required', 'No payment -- see charging schemes'),
    403: ('Forbidden', 'Request forbidden -- authorization will not help'),
    404: ('Not Found', 'Nothing matches the given URI'),
    405: ('Method Not Allowed', 'Specified method is invalid for this server.'),
    406: ('Not Acceptable', 'URI not available in preferred format.'),
    407: ('Proxy Authentication Required', 'You must authenticate with '
          'this proxy before proceeding.'),
    408: ('Request Time-out', 'Request timed out; try again later.'),
    409: ('Conflict', 'Request conflict.'),
    410: ('Gone', 'URI no longer exists and has been permanently removed.'),
    411: ('Length Required', 'Client must specify Content-Length.'),
    412: ('Precondition Failed', 'Precondition in headers is false.'),
    413: ('Request Entity Too Large', 'Entity is too large.'),
    414: ('Request-URI Too Long', 'URI is too long.'),
    415: ('Unsupported Media Type', 'Entity body in unsupported format.'),
    416: ('Requested Range Not Satisfiable', 'Cannot satisfy request range.'),
    417: ('Expectation Failed', 'Expect condition could not be satisfied.'),
    500: ('Internal error', 'Server got itself in trouble'),
    501: ('Not Implemented', 'Server does not support this operation'),
    502: ('Bad Gateway', 'Invalid responses from another server/proxy.'),
    503: ('Service temporarily overloaded', 'The server cannot '
          'process the request due to a high load'),
    504: ('Gateway timeout',
          'The gateway server did not receive a timely response'),
    505: ('HTTP Version not supported', 'Cannot fulfill request.'),
}
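# A minimal sketch of the rate-limiting improvement described in api()
# below: latch the time of the last api call and sleep only for the
# remainder of the 1-second interval the del.icio.us api asks for.
# This helper is illustrative only and is not wired into api(); the
# names _lastApiCall and throttle are assumptions, not part of the
# original script.
_lastApiCall = [0.0]

def throttle():
    # sleep just long enough that successive calls are >= 1 second apart
    elapsed = time.time() - _lastApiCall[0]
    if elapsed < 1.0:
        time.sleep(1.0 - elapsed)
    _lastApiCall[0] = time.time()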
# TODO: handle 503's, or retries for other reasons ...
def api(method, args):
    maxRetries = 5
    while maxRetries > 0:
        # delicious api asks for a 1 second delay between calls: drag!
        # impose it here in hardcoded fashion; an improvement would
        # be to latch the time of last api call and sleep to the next 1s
        # interval. whatever.
        time.sleep(1)
        url = baseUrl + method
        request = urllib2.Request(url, args)
        b64 = base64.encodestring('%s:%s' % (username, password))[:-1]
        request.add_header('Authorization', 'Basic %s' % b64)
        try:
            data = urllib2.urlopen(request)
            return True, data
        except IOError, e:
            # only HTTPError instances carry a code attribute
            code = getattr(e, "code", None)
            if code in httpresponses:
                print "%d: %s, %s" % \
                    (code, httpresponses[code][0], httpresponses[code][1])
            else:
                print "unknown code: %s" % e
            # the api asks clients to back off when throttled
            if code == 503:
                time.sleep(1)
            maxRetries -= 1
        except httplib.BadStatusLine, line:
            # occasionally urllib2 gets a response that it chokes on;
            # haven't caught one live yet to debug further ...
            print "httplib.BadStatusLine"
            print line
            maxRetries -= 1
    return False, None

def getPosts():
    posts = []
    success, data = api("posts/all", None)
    if not success:
        return posts
    # create Reader object
    reader = Sax2.Reader()
    # parse the document
    doc = reader.fromStream(data)
    for post in doc.getElementsByTagName("post"):
        posts.append(post.getAttribute("href"))
    return posts

def getTags():
    tags = []
    success, data = api("tags/get", None)
    if not success:
        return tags
    # create Reader object
    reader = Sax2.Reader()
    # parse the document
    doc = reader.fromStream(data)
    for tag in doc.getElementsByTagName("tag"):
        tags.append(tag.getAttribute("tag"))
    return tags

def renameTag(old, new):
    args = {"old": old, "new": new}
    urlArgs = urllib.urlencode(args)
    success, data = api("tags/rename?", urlArgs)
    if not success:
        return False
    # create Reader object
    reader = Sax2.Reader()
    # parse the document
    doc = reader.fromStream(data)
    # XML response is always:
    #   <result>done</result>
    #
    # In DOM the "result" tag is a NodeList with 1 Element Node,
    # said Element Node has 0 attributes and 1 childNode, whose
    # nodeValue is "done"
    result = doc.getElementsByTagName("result")
    if len(result) != 1:
        print "unexpected xml response:"
        print data
        return False
    if hasattr(result[0], "childNodes") and \
            result[0].childNodes[0].nodeValue == "done":
        print "renamed"
        return True
    print "unexpected xml data"
    print data
    return False

def post(url, desc, tags):
    tagsString = string.join(tags)
    args = {
        "url": url,
        "description": desc,
        "tags": tagsString,
        "replace": "no"  # docs say no is the default, but play it safe
    }
    urlArgs = urllib.urlencode(args)
    success, data = api("posts/add?", urlArgs)
    if not success:
        return False
    # create Reader object
    reader = Sax2.Reader()
    # parse the document
    doc = reader.fromStream(data)
    # XML response if the post was successful:
    #   <result code="done" />
    # XML response if the post failed:
    #   <result code="something went wrong" />
    #
    # In DOM the "result" tag is a NodeList with 1 Element Node,
    # said Element Node has 1 attribute, "code", and 0 children
    result = doc.getElementsByTagName("result")
    if len(result) != 1:
        print "unexpected xml response: length != 1"
        print "post failed for:", desc
        print "\ttags were:", tagsString
        print data
        return False
    code = result[0].getAttribute("code")
    if code == "done":
        print "successfully posted:", desc
        print "\ttags were:", tagsString
        return True
    print "unexpected xml response: %s" % code
    print "post failed for:", desc
    print "\ttags were:", tagsString
    return False

#################################################
#
# HTML parser for Firefox bookmarks.html format
#
# NB: This was derived by examining the format of my bookmarks.html,
# nothing more rigorous than that.
#
# I haven't looked for a specification of the format, or even
# looked at any other samples of Firefox bookmark output, so there
# are certain to be holes in this logic.
#
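# For reference, an illustrative (hand-written, not captured from
# Firefox) fragment of the structure the parser below assumes:
#
#   <DL><p>
#       <DT><H3>Folder Name</H3>
#       <DL><p>
#           <DT><A HREF="http://example.com/">Bookmark title</A>
#       </DL><p>
#   </DL><p>
#
# each <h3> becomes a tag, each <dl> opens/closes a folder scope,
# and each <a> is a bookmark to post.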
from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):

    def __init__(self):
        # this is python: explicitly invoke the base class constructor
        HTMLParser.__init__(self)
        self.inH3 = False
        self.inA = False
        self.tagCount = 0
        self.tags = []
        self.currentTag = ""
        self.href = ""
        self.description = ""
        self.ignore = ""

    def setBaseTag(self, baseTag):
        self.tags.append(baseTag)

    def setIgnoreUrls(self, ignore):
        self.ignore = ignore

    # remove white space
    # remove apostrophes, backquotes, double-quotes, colons, commas
    def normalizeText(self, text):
        text = text.replace('\'', '')
        text = text.replace('"', '')
        text = text.replace('`', '')
        text = text.replace(':', '')
        text = text.replace(',', '')
        text = text.replace(' ', '')
        return text

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            self.inA = True
            for attr in attrs:
                if attr[0] == "href":
                    self.href = attr[1]
        if tag == "h3":
            self.inH3 = True
            self.tagCount += 1
        if tag == "dl":
            pass
            #print "Entering folder list; tags are", self.tags

    def handle_endtag(self, tag):
        if tag == "h3":
            self.tags.append(self.currentTag)
            self.currentTag = ""
            self.inH3 = False
        if tag == "a":
            if debug == True:
                print
                print "href =", self.href
                print "description =", self.description
                print "tags =", self.tags
            # validate href
            validHref = True
            if len(self.href) == 0:
                validHref = False
            if not self.href.split(":")[0] in ["http", "https", "news", "ftp"]:
                validHref = False
            if self.href in self.ignore:
                validHref = False
            # actually post here, make sure there's a url to post
            if validHref:
                post(self.href, self.description, self.tags)
            self.href = ""
            self.description = ""
            self.inA = False
        # exiting a dl means end of a bookmarks folder, pop the last tag off
        if tag == "dl":
            self.tags = self.tags[:-1]

    # handle any data: note that this will miss the "escaped" stuff;
    # fix this by adding handle_charref, etc. methods
    def handle_data(self, data):
        if self.inH3:
            self.currentTag += self.normalizeText(data)
        if self.inA:
            self.description += data

def doit():
    # retrieve the full list of posts to avoid requests for dup's
    print "Retrieving your existing posts to speed the upload ..."
    posts = getPosts()
    print "Got them: you have %d posts on del.icio.us now" % len(posts)
    # construct and configure the parser
    parser = MyHTMLParser()
    if baseTag and len(baseTag) > 0:
        parser.setBaseTag(baseTag)
    parser.setIgnoreUrls(posts)
    # initiate the parse; this will submit requests to delicious
    parser.feed(open(bookmarks).read())
    # cleanup
    parser.close()

def usage():
    print "Usage: deli.py --bookmarks=<file> --username=<name> --password=<password> [--tags=<tags>]"
    print "  bookmarks, username, password should be self-explanatory"
    print "  tags is a white-space separated list of tags to apply to all bookmarks"

def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "b:u:p:t:",
                                   ["bookmarks=", "username=", "password=", "tags="])
    except getopt.GetoptError:
        # print help information and exit:
        usage()
        sys.exit(2)
    # use the globals
    global bookmarks
    global username
    global password
    global baseTag
    for o, a in opts:
        if o in ("-b", "--bookmarks"):
            bookmarks = a
        if o in ("-u", "--username"):
            username = a
        if o in ("-p", "--password"):
            password = a
        if o in ("-t", "--tags"):
            baseTag = a
    if not bookmarks or not username or not password:
        usage()
        sys.exit(1)
    # go forth
    doit()

# only run when executed directly, so the globals above can be
# overridden by scripts that import deli.py
if __name__ == "__main__":
    main()
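# Example invocation (all values are placeholders, assuming a Firefox
# bookmarks.html export in the current directory):
#
#   python deli.py --bookmarks=bookmarks.html --username=alice \
#       --password=secret --tags=ExperimentalImports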