Basic Python script to convert a list of URLs into Zotero RDF for import

bsdz · April 27, 2009

Here is a simple (perhaps throwaway?) script for converting a file containing a list of URLs into a Zotero RDF file suitable for importing.

I found it useful; maybe someone else will :)

In addition to installing Python, you will also need to install the lxml module.

Copy the contents below into your favourite text editor and save the file as convert.py or a similar name. On the command line use: -


> convert.py myfilename

This should produce a new file called myfilename.rdf suitable for importing into Zotero.

Script is here: -


from sys import argv, stdout
from os.path import exists
from datetime import datetime
from lxml.html import parse
from codecs import getwriter
from cgi import escape

# convert.py -- turn a text file of URLs (one per line) into a Zotero RDF file.
#
# NOTE: this script targets Python 2 (it relies on cgi.escape and on wrapping
# a byte-oriented file object with a codecs writer).
#
# The original wrapped the imported ``stdout`` name in an 'mbcs' writer that
# was never used afterwards; 'mbcs' only exists on Windows, so that line made
# the script fail at start-up everywhere else.  It has been dropped.

usage_string = """
usage: convert.py <link list>
          where <link list> is file containing URLs, one on each line
note: this script will connect to each url and attempt to download the 
      URL's title.
"""

if len(argv) != 2 or not exists(argv[1]):
    print(usage_string)
    raise SystemExit(1)

file_path = argv[1]


rdf_header = """<?xml version="1.0"?>
<RDF:RDF xmlns:z="http://www.zotero.org/namespaces/export#"
         xmlns:link="http://purl.org/rss/1.0/modules/link/"
         xmlns:vcard="http://nwalsh.com/rdf/vCard#"
         xmlns:foaf="http://xmlns.com/foaf/0.1/"
         xmlns:prism="http://prismstandard.org/namespaces/1.2/basic/"
         xmlns:dcterms="http://purl.org/dc/terms/"
         xmlns:dc="http://purl.org/dc/elements/1.1/"
         xmlns:bib="http://purl.org/net/biblio#"
         xmlns:NC="http://home.netscape.com/NC-rdf#"
         xmlns:RDF="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
"""

rdf_footer = """
</RDF:RDF>
"""

# One attachment entry; filled with (index, url, index, title, date, index).
item_template = """
      <dcterms:URI RDF:about="rdf:#zotero_convert_%s"
                   RDF:value="%s" />
      <z:Attachment RDF:ID="item_%s"
                       z:itemType="attachment"
                       dc:title="%s"
                       dcterms:dateSubmitted="%s"
                       link:type="text/html"
                       link:charset="25">
        <dc:identifier RDF:resource="rdf:#zotero_convert_%s"/>
      </z:Attachment>
    """

date_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

with open(file_path, "r") as input_file:
    # The XML declaration above names no encoding, so readers assume UTF-8;
    # encode the output accordingly instead of using the Windows-only 'mbcs'.
    output_file = getwriter('utf-8')(open(file_path + ".rdf", "w"))
    try:
        output_file.write(rdf_header)

        for i, l in enumerate(input_file):
            raw_url = l.strip()
            if not raw_url:
                # Blank lines carry no URL; do not emit empty items for them.
                continue

            # Both values end up inside XML attributes, so double quotes must
            # be escaped as well (second argument of cgi.escape).
            url = escape(raw_url, True)

            try:
                # Fetch the *unescaped* URL: escaping first would turn every
                # '&' in a query string into '&amp;' and request the wrong page.
                title = escape(parse(raw_url).find(".//title").text.strip(), True)
            except Exception:
                # Best effort: any download/parse problem (or a page without
                # a <title>) falls back to using the URL as the title.
                title = url

            output_file.write(item_template % (i, url, i, title, date_now, i))

        output_file.write(rdf_footer)
    finally:
        output_file.close()

bdarcus · April 27, 2009

Lxml is a pretty heavy-weight requirement for a pretty simple script. You might consider doing something like I do in this code, where you fall back to element tree (included in the standard library now). That way, people use lxml if they want for a little extra speed, but it works with etree.

Also, an aside: this reminds me of just how bad the current RDF support is in Zotero. Hopefully we can get that fixed soon. I have some other Python code that works pretty well for dealing with the RDF.

BTW, are you sure you have that RDF encoded right? Seems odd to put a title property on the z:Attachment resource.

bsdz · April 27, 2009

As I said before it's a -throwaway- script. I just used it to successfully import 35 notebooks from Google and Zoho Notebook so -yes- it definitely works.

You're welcome to update it if you really want to; however, I won't be updating it myself since I don't need it anymore...

jgrigera · April 11, 2022

For some reason this is the landing page on most search engines for "Python Create Zotero RDF". As @bsdz mentioned many years ago, these are always throwaway scripts... but here are my 2 cents.

This is the generic ZoteroRDF writer


import logging
from xml.sax.saxutils import escape
import lxml.etree as xmlet

logger = logging.getLogger(__name__)


class ZoteroRDF:
    """Context manager that hands out a writer for a Zotero RDF file.

    Entering the ``with`` block creates a ``_ZoteroRDFWriter`` for
    ``filename`` and returns it; leaving the block calls the writer's
    ``_exit()`` method.
    """

    def __init__(self, filename):
        """Remember the path of the RDF file to be written."""
        self.filename = filename

    def __enter__(self):
        """Create the writer, keep a reference to it and return it."""
        rdf_writer = _ZoteroRDFWriter(self.filename)
        self.writer = rdf_writer
        return rdf_writer

    def __exit__(self, type, value, traceback):
        """Finish the writer; the exception details are not inspected."""
        self.writer._exit()


class _ZoteroRDFWriter:
    """Stream bibliographic reports and attachments into a Zotero RDF file.

    The file is opened and the RDF header is written on construction; call
    ``_exit()`` to write the footer and close the file.  Every value supplied
    by the caller is XML-escaped before it is written, so characters such as
    ``&``, ``<`` or ``"`` in titles, names or paths cannot corrupt the output.
    """

    HEADER = """<rdf:RDF
 xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
 xmlns:z="http://www.zotero.org/namespaces/export#"
 xmlns:dc="http://purl.org/dc/elements/1.1/"
 xmlns:vcard="http://nwalsh.com/rdf/vCard#"
 xmlns:foaf="http://xmlns.com/foaf/0.1/"
 xmlns:bib="http://purl.org/net/biblio#"
 xmlns:link="http://purl.org/rss/1.0/modules/link/"
 xmlns:dcterms="http://purl.org/dc/terms/"
 xmlns:prism="http://prismstandard.org/namespaces/1.2/basic/">
"""
    FOOTER = "\n</rdf:RDF>"

    # Maps an item key to how it is rendered: either a plain ``tagname``
    # wrapping the value, a ``placeholder`` format string, or a ``function``
    # called as ``function(fd, value)``.
    TAG_MAP = {
        'url': dict(placeholder="<dc:identifier><dcterms:URI><rdf:value>{}</rdf:value></dcterms:URI></dc:identifier>"),
        "title": dict(tagname="dc:title"),
        "date": dict(tagname="dcterms:date"),
        "year": dict(tagname="dcterms:date"),
        "publisher": dict(tagname="dcterms:publisher"),
        "journal": dict(tagname="dcterms:source"),
        "volume": dict(tagname="dcterms:volume"),
        "number": dict(tagname="dcterms:issue"),
        "pages": dict(tagname="bib:pages"),
        "doi": dict(tagname="dcterms:identifier"),
        "abstract": dict(tagname="dcterms:abstract"),
        "extra": dict(tagname="dc:description"),
        "language": dict(tagname="z:language"),
        "keywords": dict(tagname="dc:subject"),
        "keyword": dict(tagname="dc:subject"),
        "tags": dict(tagname="dc:subject"),
        "tag": dict(tagname="dc:subject"),
        "reporttype": dict(tagname="z:type"),
        "reportno": dict(tagname="prism:number"),
        "organization": dict(placeholder="\n<dc:publisher><foaf:Organization><vcard:adr><vcard:Address><vcard:locality>{place}</vcard:locality></vcard:Address></vcard:adr><foaf:name>{name}</foaf:name></foaf:Organization></dc:publisher>"),
    }

    # Item keys that are rendered as the <bib:authors> sequence.
    _AUTHOR_KEYS = ("authors", "author", "creator", "creators")
    _AUTHOR_TEMPLATE = "<rdf:li><foaf:Person><foaf:givenName>{first}</foaf:givenName><foaf:surname>{last}</foaf:surname></foaf:Person></rdf:li>"

    # escape() leaves double quotes untouched; inside an attribute value they
    # must be encoded too.
    _ATTR_ENTITIES = {'"': "&quot;"}

    def __init__(self, filename):
        """Open ``filename`` for writing and emit the RDF header."""
        self.filename = filename
        # The header carries no XML declaration, so readers assume UTF-8;
        # write UTF-8 explicitly instead of the platform default encoding.
        self.fd = open(self.filename, "wt", encoding="utf-8")
        self.fd.write(self.HEADER)

    def _exit(self):
        """Write the RDF footer and close the file."""
        self.fd.write(self.FOOTER)
        self.fd.close()

    @staticmethod
    def _text(value):
        """Return ``value`` as XML-safe element text (``None`` becomes '')."""
        if value is None:
            return ""
        return escape(str(value))

    @classmethod
    def _attr(cls, value):
        """Return ``value`` as XML-safe text for a double-quoted attribute."""
        if value is None:
            return ""
        return escape(str(value), cls._ATTR_ENTITIES)

    @classmethod
    def _escaped(cls, mapping):
        """Return a copy of ``mapping`` with every value XML-escaped."""
        return {name: cls._text(val) for name, val in mapping.items()}

    def _write_authors(self, value):
        """Write one or more ``{'first': ..., 'last': ...}`` dicts as authors."""
        people = value if isinstance(value, (list, tuple)) else [value]
        self.fd.write("\n<bib:authors><rdf:Seq>")
        self.fd.write("\n".join(
            self._AUTHOR_TEMPLATE.format_map(self._escaped(person))
            for person in people
        ))
        self.fd.write("\n</rdf:Seq></bib:authors>")

    def add_bib_item(self, item):
        """Write ``item`` as a ``bib:Report`` entry.

        ``item`` is a dict that must contain ``id``.  Every other key must be
        one of the ``TAG_MAP`` keys or an author key (``author``, ``authors``,
        ``creator``, ``creators``).  Tag values may be a scalar or a list of
        scalars; placeholder values may be a scalar or a dict of named fields;
        author values are a dict with ``first`` and ``last`` or a list of such
        dicts.  Values that are ``None`` are skipped.

        Raises:
            ValueError: if ``id`` is missing or a key is not supported.
            Exception: if a ``TAG_MAP`` entry has none of the known kinds.
        """
        if 'id' not in item:
            raise ValueError("Missing id")
        itemid = self._attr(item['id'])

        self.fd.write(f"\n\n<bib:Report rdf:about=\"#item_{itemid}\">\n  <z:itemType>report</z:itemType>\n")

        for key, value in item.items():
            if key == "id":
                continue
            if key in self._AUTHOR_KEYS:
                if value is not None:
                    self._write_authors(value)
                continue
            if key not in self.TAG_MAP:
                raise ValueError("Key {} not supported".format(key))

            spec = self.TAG_MAP[key]
            if 'function' in spec:
                spec['function'](self.fd, value)
                continue
            if 'placeholder' not in spec and 'tagname' not in spec:
                raise Exception(f"Unknown tag type {spec} - malformed TAG_MAP?")
            if value is None:
                # No data for this field: emit nothing rather than "None".
                continue

            if 'placeholder' in spec:
                if isinstance(value, dict):
                    fragment = spec['placeholder'].format_map(self._escaped(value))
                else:
                    fragment = spec['placeholder'].format(self._text(value))
            else:
                tag = spec['tagname']
                if isinstance(value, (list, tuple)):
                    fragment = "\n".join(f"\n<{tag}>{self._text(v)}</{tag}>" for v in value)
                else:
                    fragment = f"\n<{tag}>{self._text(value)}</{tag}>"

            # Build the fragment once and reuse it for the debug log.
            self.fd.write(fragment)
            logger.debug("%s", fragment)

        self.fd.write(f"\n<link:link rdf:resource=\"#attach_{itemid}\"/>")
        self.fd.write("\n</bib:Report>")

    def add_attachment(self, item):
        """Write ``item`` as a ``z:Attachment`` entry.

        ``item`` is a dict with the keys ``id``, ``path``, ``name`` and
        ``type``; ``id`` should match the ``id`` of the report it belongs to.

        Raises:
            ValueError: if any of the four keys is missing.
        """
        missing = [field for field in ("id", "path", "name", "type") if field not in item]
        if missing:
            raise ValueError("Missing " + ", ".join(missing))

        # ``id`` and ``path`` land inside attributes, the rest in element text.
        fields = {
            "id": self._attr(item["id"]),
            "path": self._attr(item["path"]),
            "name": self._text(item["name"]),
            "type": self._text(item["type"]),
        }
        self.fd.write("""
    <z:Attachment rdf:about="#attach_{id}">
        <z:itemType>attachment</z:itemType>
        <rdf:resource rdf:resource="{path}"/>
        <dc:title>{name}</dc:title>
        <link:type>{type}</link:type>
    </z:Attachment>""".format_map(fields)
                      )

and this is a simple usage:


    # Example: write one report plus its PDF attachment for every source
    # record whose PDF is present in the current directory.
    # NOTE(review): `your_list_of_items`, `authors`, `abstract` and `Path`
    # are not defined in this snippet -- they must be supplied by the caller.
    with ZoteroRDF("00_bd.rdf") as rdf:
        for record in your_list_of_items:
            itemid = record['identifier']
            item = {
                "id": itemid,
                "title": record['title'],
                "date": record['date'],
                "author": authors,
                "abstract": abstract,
                "extra": record['publisher'],
                "keywords": [record['subject_1'], record['subject_2'], record['subject_3']],
                "language": "es",
            }

            pdf_path = '{}.pdf'.format(itemid)
            if not Path(pdf_path).exists():
                # Without the PDF nothing is written for this record.
                print("Missing {}. File not here".format(pdf_path))
                continue

            rdf.add_bib_item(item)
            attachment = dict(id=itemid, path=pdf_path, name=f'{itemid}.pdf', type='application/pdf')
            rdf.add_attachment(attachment)