Basic Python script to convert a list of URLs into Zotero RDF for import

Here is a simple (perhaps throwaway?) script for converting a file containing a list of URLs into a Zotero RDF file suitable for importing.

I found it useful; maybe someone else will :)

In addition to installing Python, you will also need to install the lxml module.

Copy the contents below into your favourite text editor and save the file as convert.py or a similar name. On the command line use: -


> convert.py myfilename


This should produce a new file called myfilename.rdf suitable for importing into Zotero.

Script is here: -


from sys import argv, stdout
from os.path import exists
from datetime import datetime
from lxml.html import parse
from codecs import getwriter
from cgi import escape

# Convert a text file of URLs (one per line) into a Zotero RDF file.
#
# For every URL the page is fetched with lxml and its <title> is used as the
# attachment title; when the page cannot be fetched or has no title, the URL
# itself is used instead.  The result is written next to the input file as
# "<input>.rdf".
#
# NOTE: the original rebound the imported name ``stdout`` to an 'mbcs' codec
# writer here.  That wrapper was never used (``print`` writes to sys.stdout)
# and 'mbcs' only exists on Windows, so the line made the script fail with
# LookupError on every other platform.  It has been dropped.

usage_string = """
usage: convert.py <link list>
where <link list> is file containing URLs, one on each line
note: this script will connect to each url and attempt to download the
URL's title.
"""

# Exactly one argument naming an existing file is required.
if len(argv) != 2 or not exists(argv[1]):
    print(usage_string)
    exit(1)

file_path = argv[1]


rdf_header = """<?xml version="1.0"?>
<RDF:RDF xmlns:z="http://www.zotero.org/namespaces/export#"
xmlns:link="http://purl.org/rss/1.0/modules/link/"
xmlns:vcard="http://nwalsh.com/rdf/vCard#"
xmlns:foaf="http://xmlns.com/foaf/0.1/"
xmlns:prism="http://prismstandard.org/namespaces/1.2/basic/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:bib="http://purl.org/net/biblio#"
xmlns:NC="http://home.netscape.com/NC-rdf#"
xmlns:RDF="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
"""

rdf_footer = """
</RDF:RDF>
"""

# One timestamp shared by every entry of this run.
date_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

input_file = open(file_path, "r")
try:
    # The XML prolog declares no encoding, so readers assume UTF-8; encode the
    # output accordingly (binary mode because the codec writer emits bytes).
    output_file = getwriter('utf-8')(open(file_path + ".rdf", "wb"))
    try:
        output_file.write(rdf_header)

        for i, l in enumerate(input_file):
            raw_url = l.strip()
            if not raw_url:
                # Blank lines are not URLs; skip them instead of emitting
                # an empty entry.
                continue

            # Fetch with the *raw* URL: escaping first would turn '&' in a
            # query string into '&amp;' and request the wrong address.
            try:
                raw_title = parse(raw_url).find(".//title").text.strip()
            except Exception:
                # Best effort: unreachable page, missing or empty <title>.
                raw_title = raw_url

            # Both values end up inside double-quoted XML attributes, so
            # quotes must be escaped as well (second argument = quote).
            url = escape(raw_url, True)
            title = escape(raw_title, True)

            output_file.write("""
<dcterms:URI RDF:about="rdf:#zotero_convert_%s"
RDF:value="%s" />
<z:Attachment RDF:ID="item_%s"
z:itemType="attachment"
dc:title="%s"
dcterms:dateSubmitted="%s"
link:type="text/html"
link:charset="25">
<dc:identifier RDF:resource="rdf:#zotero_convert_%s"/>
</z:Attachment>
""" % (i, url, i, title, date_now, i))

        output_file.write(rdf_footer)
    finally:
        output_file.close()
finally:
    input_file.close()
  • Lxml is a pretty heavy-weight requirement for a pretty simple script. You might consider doing something like I do in this code, where you fall back to element tree (included in the standard library now). That way, people use lxml if they want for a little extra speed, but it works with etree.

    Also, an aside: this reminds me of just how bad the current RDF support is in Zotero. Hopefully we can get that fixed soon. I have some other Python code that works pretty well for dealing with the RDF.

    BTW, are you sure you have that RDF encoded right? Seems odd to put a title property on the z:Attachment resource.
  • As I said before it's a -throwaway- script. I just used it to successfully import 35 notebooks from Google and Zoho Notebook so -yes- it definitely works.

    You're welcome to update it if you really want to; however, I won't be updating it myself since I don't need it anymore...
  • For some reason this page is the top result on most search engines for "Python Create Zotero RDF". As @bsdz mentioned many years ago, these are always throwaway scripts... but here are my 2 cents.

    This is the generic ZoteroRDF writer


    import logging
    from xml.sax.saxutils import escape
    import lxml.etree as xmlet

    logger = logging.getLogger(__name__)


    class ZoteroRDF:
        """Context manager that yields a writer for a Zotero RDF file.

        Entering the ``with`` block opens ``filename`` and writes the RDF
        header; leaving it (normally or through an exception) writes the
        footer and closes the file.  Exceptions are not suppressed.
        """

        def __init__(self, filename):
            self.filename = filename

        def __enter__(self):
            self.writer = _ZoteroRDFWriter(self.filename)
            return self.writer

        def __exit__(self, exc_type, exc_value, traceback):
            self.writer._exit()


    class _ZoteroRDFWriter:
        """Writes ``bib:Report`` items and ``z:Attachment`` entries to a file.

        The file is opened (and the header written) on construction; call
        ``_exit()`` to write the footer and close it.  Normally used through
        ``ZoteroRDF``.
        """

        HEADER = (
            '<rdf:RDF\n'
            'xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"\n'
            'xmlns:z="http://www.zotero.org/namespaces/export#"\n'
            'xmlns:dc="http://purl.org/dc/elements/1.1/"\n'
            'xmlns:vcard="http://nwalsh.com/rdf/vCard#"\n'
            'xmlns:foaf="http://xmlns.com/foaf/0.1/"\n'
            'xmlns:bib="http://purl.org/net/biblio#"\n'
            'xmlns:link="http://purl.org/rss/1.0/modules/link/"\n'
            'xmlns:dcterms="http://purl.org/dc/terms/"\n'
            'xmlns:prism="http://prismstandard.org/namespaces/1.2/basic/">\n'
        )
        FOOTER = "\n</rdf:RDF>"

        # Maps an item key to how its value is rendered:
        #   tagname     -> <tagname>value</tagname> (one element per list entry)
        #   placeholder -> str.format template; a dict value is applied with
        #                  format_map, anything else fills the single "{}"
        #   function    -> callable(fd, value) that writes by itself
        TAG_MAP = {
            'url': dict(placeholder="<dc:identifier><dcterms:URI><rdf:value>{}</rdf:value></dcterms:URI></dc:identifier>"),
            "title": dict(tagname="dc:title"),
            "date": dict(tagname="dcterms:date"),
            "year": dict(tagname="dcterms:date"),
            "publisher": dict(tagname="dcterms:publisher"),
            "journal": dict(tagname="dcterms:source"),
            "volume": dict(tagname="dcterms:volume"),
            "number": dict(tagname="dcterms:issue"),
            "pages": dict(tagname="bib:pages"),
            "doi": dict(tagname="dcterms:identifier"),
            "abstract": dict(tagname="dcterms:abstract"),
            "extra": dict(tagname="dc:description"),
            "language": dict(tagname="z:language"),
            "keywords": dict(tagname="dc:subject"),
            "keyword": dict(tagname="dc:subject"),
            "tags": dict(tagname="dc:subject"),
            "tag": dict(tagname="dc:subject"),
            "reporttype": dict(tagname="z:type"),
            "reportno": dict(tagname="prism:number"),
            "organization": dict(placeholder="\n<dc:publisher><foaf:Organization><vcard:adr><vcard:Address><vcard:locality>{place}</vcard:locality></vcard:Address></vcard:adr><foaf:name>{name}</foaf:name></foaf:Organization></dc:publisher>"),
        }

        # Item keys rendered as a <bib:authors> sequence.
        _AUTHOR_KEYS = ("authors", "author", "creator", "creators")

        _AUTHOR_TEMPLATE = (
            "<rdf:li><foaf:Person><foaf:givenName>{first}</foaf:givenName>"
            "<foaf:surname>{last}</foaf:surname></foaf:Person></rdf:li>"
        )

        def __init__(self, filename):
            self.filename = filename
            # No XML declaration is written, so readers assume UTF-8; do not
            # depend on the platform's default encoding.
            self.fd = open(self.filename, "w", encoding="utf-8")
            self.fd.write(self.HEADER)

        def _exit(self):
            """Write the closing tag and close the file."""
            try:
                self.fd.write(self.FOOTER)
            finally:
                self.fd.close()

        @staticmethod
        def _text(value):
            """Return ``value`` as a string escaped for XML element content."""
            return escape(str(value))

        @staticmethod
        def _attr(value):
            """Return ``value`` escaped for a double-quoted XML attribute."""
            return escape(str(value), {'"': "&quot;"})

        def add_bib_item(self, item):
            """Write one ``bib:Report`` entry.

            ``item`` is a mapping that must contain ``id``; every other key
            must be a ``TAG_MAP`` key or one of the author keys
            (``author``/``authors``/``creator``/``creators``, holding a dict or
            a list of dicts with ``first`` and ``last``).  Values are
            converted with ``str`` and XML-escaped.

            Raises ValueError when ``id`` is missing or a key is unsupported;
            in that case nothing is written for the item.
            """
            if 'id' not in item:
                raise ValueError("Missing id")
            itemid = self._attr(item['id'])

            # Validate every key first so a bad item cannot leave a
            # half-written <bib:Report> element in the file.
            for key in item:
                if key not in self.TAG_MAP and key not in self._AUTHOR_KEYS and key != "id":
                    raise ValueError("Key {} not supported".format(key))

            log = logging.getLogger(__name__)
            self.fd.write(f'\n\n<bib:Report rdf:about="#item_{itemid}">\n <z:itemType>report</z:itemType>\n')

            for key, value in item.items():
                if key == "id":
                    continue

                if key in self._AUTHOR_KEYS:
                    people = value if isinstance(value, (list, tuple)) else [value]
                    self.fd.write("\n<bib:authors><rdf:Seq>")
                    self.fd.write("\n".join(
                        self._AUTHOR_TEMPLATE.format_map(
                            {k: self._text(v) for k, v in person.items()}
                        )
                        for person in people
                    ))
                    self.fd.write("\n</rdf:Seq></bib:authors>")
                    continue

                spec = self.TAG_MAP[key]
                if 'function' in spec:
                    spec['function'](self.fd, value)
                    continue

                if 'placeholder' in spec:
                    if isinstance(value, dict):
                        fragment = spec['placeholder'].format_map(
                            {k: self._text(v) for k, v in value.items()}
                        )
                    else:
                        fragment = spec['placeholder'].format(self._text(value))
                elif 'tagname' in spec:
                    tag = spec['tagname']
                    if isinstance(value, (list, tuple)):
                        fragment = "\n".join(
                            f"\n<{tag}>{self._text(v)}</{tag}>" for v in value
                        )
                    else:
                        fragment = f"\n<{tag}>{self._text(value)}</{tag}>"
                else:
                    raise Exception(f"Unknown tag type {spec} - malformed TAG_MAP?")

                self.fd.write(fragment)
                log.debug(fragment)

            self.fd.write(f'\n<link:link rdf:resource="#attach_{itemid}"/>')
            self.fd.write("\n</bib:Report>")

        def add_attachment(self, item):
            """Write one ``z:Attachment`` entry.

            ``item`` must contain ``id``, ``path``, ``name`` and ``type``;
            ValueError is raised otherwise.  All four values are XML-escaped.
            """
            missing = [k for k in ("id", "path", "name", "type") if k not in item]
            if missing:
                raise ValueError("Missing " + ", ".join(missing))

            self.fd.write(
                '\n<z:Attachment rdf:about="#attach_{id}">'
                '\n<z:itemType>attachment</z:itemType>'
                '\n<rdf:resource rdf:resource="{path}"/>'
                '\n<dc:title>{name}</dc:title>'
                '\n<link:type>{type}</link:type>'
                '\n</z:Attachment>'.format(
                    id=self._attr(item['id']),
                    path=self._attr(item['path']),
                    name=self._text(item['name']),
                    type=self._text(item['type']),
                )
            )



    and this is a simple usage:


    from pathlib import Path

    # NOTE: your_list_of_items, authors and abstract are placeholders that are
    # not defined in this snippet; presumably they come from your own data
    # (authors as a dict, or list of dicts, with "first" and "last").
    with ZoteroRDF("00_bd.rdf") as rdf:
        for d in your_list_of_items:
            itemid = d['identifier']
            item = dict(
                id=itemid,
                title=d['title'],
                date=d['date'],
                author=authors,
                abstract=abstract,
                extra=d['publisher'],
                keywords=[d['subject_1'], d['subject_2'], d['subject_3']],
                language="es"
            )

            # Only items whose PDF is present are exported, each together
            # with its attachment entry.
            path = '{}.pdf'.format(itemid)
            if Path(path).exists():
                rdf.add_bib_item(item)
                rdf.add_attachment(dict(id=itemid, path=path, name=f'{itemid}.pdf', type='application/pdf'))
            else:
                print("Missing {}. File not here".format(path))

Sign In or Register to comment.