#! /usr/bin/python
# for handling the api via http
import urllib, urllib2, base64, httplib
# for xml parsing
from xml.dom.ext.reader import Sax2
# other useful system libs
import getopt
import string
import sys
import time
# globals, you can set the values here if you import deli.py,
# and override on the command-line if you run it
bookmarks="bookmarks.html"
username="username"
password="password"
baseTag="ExperimentalImports" # this will assign the specified tag(s) to every url imported
# by the script, may be handy later
# update this if the api url changes
baseUrl="http://del.icio.us/api/"
# debug
debug = False
# urllib2 handling code from the great tutorial at:
# http://www.voidspace.org.uk/python/articles/urllib2.shtml
# Table mapping response codes to messages; entries have the
# form {code: (shortmessage, longmessage)}.
# Consulted by api() to turn an HTTP error code into a readable message.
httpresponses = {
    100: ('Continue', 'Request received, please continue'),
    101: ('Switching Protocols',
          'Switching to new protocol; obey Upgrade header'),
    200: ('OK', 'Request fulfilled, document follows'),
    201: ('Created', 'Document created, URL follows'),
    202: ('Accepted',
          'Request accepted, processing continues off-line'),
    203: ('Non-Authoritative Information',
          'Request fulfilled from cache'),
    204: ('No response', 'Request fulfilled, nothing follows'),
    205: ('Reset Content', 'Clear input form for further input.'),
    206: ('Partial Content', 'Partial content follows.'),
    300: ('Multiple Choices',
          'Object has several resources -- see URI list'),
    301: ('Moved Permanently',
          'Object moved permanently -- see URI list'),
    302: ('Found', 'Object moved temporarily -- see URI list'),
    303: ('See Other', 'Object moved -- see Method and URL list'),
    304: ('Not modified',
          'Document has not changed since given time'),
    305: ('Use Proxy',
          'You must use proxy specified in Location'
          ' to access this resource.'),
    307: ('Temporary Redirect',
          'Object moved temporarily -- see URI list'),
    400: ('Bad request',
          'Bad request syntax or unsupported method'),
    401: ('Unauthorized',
          'No permission -- see authorization schemes'),
    402: ('Payment required',
          'No payment -- see charging schemes'),
    403: ('Forbidden',
          'Request forbidden -- authorization will not help'),
    404: ('Not Found', 'Nothing matches the given URI'),
    405: ('Method Not Allowed',
          'Specified method is invalid for this server.'),
    406: ('Not Acceptable',
          'URI not available in preferred format.'),
    407: ('Proxy Authentication Required',
          'You must authenticate with '
          'this proxy before proceeding.'),
    408: ('Request Time-out',
          'Request timed out; try again later.'),
    409: ('Conflict', 'Request conflict.'),
    410: ('Gone',
          'URI no longer exists and has been permanently removed.'),
    411: ('Length Required', 'Client must specify Content-Length.'),
    412: ('Precondition Failed',
          'Precondition in headers is false.'),
    413: ('Request Entity Too Large', 'Entity is too large.'),
    414: ('Request-URI Too Long', 'URI is too long.'),
    415: ('Unsupported Media Type',
          'Entity body in unsupported format.'),
    416: ('Requested Range Not Satisfiable',
          'Cannot satisfy request range.'),
    417: ('Expectation Failed',
          'Expect condition could not be satisfied.'),
    500: ('Internal error', 'Server got itself in trouble'),
    501: ('Not Implemented',
          'Server does not support this operation'),
    502: ('Bad Gateway',
          'Invalid responses from another server/proxy.'),
    503: ('Service temporarily overloaded',
          'The server cannot '
          'process the request due to a high load'),
    504: ('Gateway timeout',
          'The gateway server did not receive a timely response'),
    505: ('HTTP Version not supported', 'Cannot fulfill request.'),
}
# TODO: handle 503's, or retries for other reasons ...
def api(method, args):
maxRetries = 5
while maxRetries > 0:
# delicious api asks for a 1 second delay between calls: drag!
# impose it here in hardcode fashion; an improvement would
# be to latch the time of last api call and sleep to the next 1s
# interval. whatever.
#
time.sleep(1)
url = baseUrl + method
request = urllib2.Request(url, args)
b64 = base64.encodestring('%s:%s' % (username, password))[:-1]
request.add_header('Authorization', 'Basic %s' % b64)
try:
data = urllib2.urlopen(request)
return True,data
except IOError, e:
code = e.code
if code in httpresponses:
print "%d: %s, %s" % \
(code, httpresponses[code][0], httpresponses[code][1])
else:
print "%d: unknown code" % code
if code == 503:
time.sleep(1)
maxRetries -= 1
except httplib.BadStatusLine, line:
# occasionally urllib2 gets a response that it chokes on
# haven't caught one live yet to debug further ...
print "httplib.BadStatusLine"
print line
maxRetries -= 1
return False,data
def getPosts():
    """Return the list of hrefs already posted to del.icio.us.

    Always returns a list; on API failure it is empty, so callers can
    safely take len() or iterate (previously None was returned here,
    which crashed doit()'s len(posts)).
    """
    posts = []
    success, data = api("posts/all", None)
    if not success:
        return posts
    # create Reader object
    reader = Sax2.Reader()
    # parse the document and collect every <post href="..."> attribute
    doc = reader.fromStream(data)
    for post in doc.getElementsByTagName("post"):
        posts.append(post.getAttribute("href"))
    return posts
def getTags():
    """Return the list of tag names on the del.icio.us account.

    Always returns a list; on API failure it is empty (previously None
    was returned, inconsistent with the success path).
    """
    tags = []
    success, data = api("tags/get", None)
    if not success:
        return tags
    # create Reader object
    reader = Sax2.Reader()
    # parse the document and collect every <tag tag="..."> attribute
    doc = reader.fromStream(data)
    for tag in doc.getElementsByTagName("tag"):
        tags.append(tag.getAttribute("tag"))
    return tags
def renameTag(old, new):
args = {
"old" : old,
"new" : new
}
urlArgs = urllib.urlencode(args)
success,data = api("tags/rename?", urlArgs)
if not success:
return False
# create Reader object
reader = Sax2.Reader()
# parse the document
doc = reader.fromStream(data)
# XML response is always:
#done
#
# In DOM the "result" tag is a NodeList with 1 Element Node,
# said Element Node has 0 attributes and 1 childNode, whose nodeValue is "done"
#
result = doc.getElementsByTagName("result")
if len(result) != 1:
print "unexpected xml response:"
print data
return False
if hasattr(result[0], "childNodes") and result[0].childNodes[0].nodeValue == "done":
print "renamed"
return True
print "unexpected xml data"
print data
return result
return False
def post(url, desc, tags):
tagsString = string.join(tags)
args = {
"url" : url,
"description" : desc,
"tags" : tagsString,
"replace" : "no" # docs say no is default, but play it safe
}
urlArgs = urllib.urlencode(args)
success,data = api("posts/add?", urlArgs)
if not success:
return
# create Reader object
reader = Sax2.Reader()
# parse the document
doc = reader.fromStream(data)
# XML response if the post was successful:
#
#
# XML response if the post failed:
#
#
# In DOM the "result" tag is a NodeList with 1 Element Node,
# said Element Node has 1 attribute, "code", and 0 children
#
result = doc.getElementsByTagName("result")
if len(result) != 1:
print "unexpected xml response: length != 1"
print "post failed for:", desc
print "\ttags were:", tagsString
print data
return False
code = result[0].getAttribute("code")
if code == "done":
print "successfully posted:", desc
print "\ttags were:", tagsString
return result
print "unexpected xml response: %s" % code
print "post failed for:", desc
print "\ttags were:", tagsString
return False
#################################################
#
# HTML parser for Firefox bookmarks.html format
#
# NB: This was derived by examining the format of my bookmarks.html,
# nothing more rigorous than that.
#
# I haven't looked for a specification of the format, or even
# looked at any other samples of Firefox bookmark output, so there
# are certain to be holes in this logic.
#
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
def __init__(self):
# this is python: explicitly invoke base class constructor
HTMLParser.__init__(self)
self.inH3 = False
self.inA = False
self.tagCount = 0
self.tags = []
self.currentTag = ""
self.href = ""
self.description = ""
self.ignore = ""
def setBaseTag(self, baseTag):
self.tags.append(baseTag)
def setIgnoreUrls(self, ignore):
self.ignore = ignore
# remove white space
# remove apostrophes, quote, double-quotes, colons, commas
def normalizeText(self, text):
text = text.replace('\'', '')
text = text.replace('"', '')
text = text.replace('`', '')
text = text.replace(':', '')
text = text.replace(',', '')
text = text.replace(' ', '')
text = text.replace(' ', '')
return text
def handle_starttag(self, tag, attrs):
if tag == "a":
self.inA = True
for attr in attrs:
if attr[0] == "href":
self.href = attr[1]
if tag == "h3":
self.inH3 = True
self.tagCount += 1
if tag == "dl":
pass
#print "Entering folder list; tags are", self.tags
def handle_endtag(self, tag):
if tag == "h3":
self.tags.append(self.currentTag)
self.currentTag = ""
self.inH3 = False
if tag == "a":
if debug == True:
print
print "href =", self.href
print "description =", self.description
print "tags =", self.tags
# validate href
validHref = True
if len(self.href) == 0:
validHref = False
if not self.href.split(":")[0] in ["http", "https", "news", "ftp"]:
validHref = False
if self.href in self.ignore:
validHref = False
# actually post here, make sure there's a url to post
if validHref:
post(self.href, self.description, self.tags)
self.href = ""
self.description = ""
self.inA = False
# exiting a dl means end of a bookmarks folder, pop the last tag off
if tag == "dl":
self.tags = self.tags[:-1]
# handle any data: note that this will miss the "escaped" stuff
# fix this by adding handle_charref, etc methods
def handle_data(self, data):
if self.inH3:
self.currentTag += self.normalizeText(data)
if self.inA:
self.description += data
def doit():
# retrieve the full list of posts to avoid requests for dup's
print "Retrieving your existing posts to speed the upload ..."
posts = getPosts()
print "Got them: you have %d posts on del.icio.us now" % len(posts)
# construct and configure the parser
parser = MyHTMLParser()
if baseTag and len(baseTag) > 0:
parser.setBaseTag(baseTag)
parser.setIgnoreUrls(posts)
# initiate the parse; this will submit requests to delicious
parser.feed(open(bookmarks).read())
# cleanup
parser.close()
def usage():
print "Usage: deli.py --bookmarks= --username= --password=]"
print " bookmarks, username, password should be self explanatory"
print " tags is a white-space separated list of tags to apply to all bookmarks"
def main():
    """Parse command-line options into the module globals and run."""
    # override the module-level defaults
    global bookmarks
    global username
    global password
    global baseTag
    try:
        opts, args = getopt.getopt(sys.argv[1:], "b:u:p:t:",
            ["bookmarks=", "username=", "password=", "tags="])
    except getopt.GetoptError:
        # print help information and exit:
        usage()
        sys.exit(2)
    for opt, val in opts:
        if opt in ("-b", "--bookmarks"):
            bookmarks = val
        elif opt in ("-u", "--username"):
            username = val
        elif opt in ("-p", "--password"):
            password = val
        elif opt in ("-t", "--tags"):
            baseTag = val
    if not (bookmarks and username and password):
        usage()
        sys.exit(1)
    # go forth
    doit()
if __name__ == "__main__":
    # only run when executed as a script, so deli.py can be imported
    # and its globals overridden first (see the top-of-file comment)
    main()