Basic Python script to convert a list of URLs into Zotero RDF for import
Here is a simple (perhaps throwaway?) script for converting a file containing a list of URLs into a Zotero RDF file suitable for importing.
I found it useful; maybe someone else will :)
In addition to installing Python, you will also need to install the lxml module.
Copy the contents below into your favourite text editor and save the file as convert.py or a similar name. On the command line, run it with the name of your URL list file as the only argument, e.g. `convert.py myfilename`.
This should produce a new file called myfilename.rdf suitable for importing into Zotero.
Script is here: -
I found it useful; maybe someone else will :)
In addition to installing Python, you will also need to install the lxml module.
Copy the contents below into your favourite text editor and save the file as convert.py or a similar name. On the command line use: -
> convert.py myfilename
This should produce a new file called myfilename.rdf suitable for importing into Zotero.
Script is here: -
from sys import argv, stdout
from os.path import exists
from datetime import datetime
from lxml.html import parse
from codecs import getwriter
from cgi import escape
# convert.py -- turn a text file of URLs (one per line) into a Zotero RDF file.
#
# For every non-blank line the script tries to download the page and use its
# <title> as the item title; if that fails for any reason the URL itself is
# used as the title.  The result is written next to the input file as
# "<input>.rdf".
#
# NOTE: this script targets Python 2 (it relies on cgi.escape, which was
# removed in Python 3.8).  The syntax below is kept valid for both versions.

# Usage text printed when the argument is missing or is not an existing file.
usage_string = """
usage: convert.py <link list>
where <link list> is file containing URLs, one on each line
note: this script will connect to each url and attempt to download the
URL's title.
"""

if len(argv) != 2 or not exists(argv[1]):
    print(usage_string)
    exit(1)

file_path = argv[1]

# The XML declaration names no encoding, so consumers will read the file as
# UTF-8; the output below is therefore written as UTF-8 as well.
rdf_header = """<?xml version="1.0"?>
<RDF:RDF xmlns:z="http://www.zotero.org/namespaces/export#"
xmlns:link="http://purl.org/rss/1.0/modules/link/"
xmlns:vcard="http://nwalsh.com/rdf/vCard#"
xmlns:foaf="http://xmlns.com/foaf/0.1/"
xmlns:prism="http://prismstandard.org/namespaces/1.2/basic/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:bib="http://purl.org/net/biblio#"
xmlns:NC="http://home.netscape.com/NC-rdf#"
xmlns:RDF="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
"""

rdf_footer = """
</RDF:RDF>
"""

# Template for one URL; filled with (index, url, index, title, date, index).
rdf_item = """
<dcterms:URI RDF:about="rdf:#zotero_convert_%s"
RDF:value="%s" />
<z:Attachment RDF:ID="item_%s"
z:itemType="attachment"
dc:title="%s"
dcterms:dateSubmitted="%s"
link:type="text/html"
link:charset="25">
<dc:identifier RDF:resource="rdf:#zotero_convert_%s"/>
</z:Attachment>
"""

date_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Both files are closed by the `with` blocks even if a write fails.
with open(file_path, "r") as input_file:
    # The codecs writer emits encoded bytes, so the underlying file is opened
    # in binary mode.  UTF-8 replaces the Windows-only 'mbcs' codec so the
    # bytes match what the XML declaration implies and the script also runs
    # on other platforms.
    with getwriter('utf-8')(open(file_path + ".rdf", "wb")) as output_file:
        output_file.write(rdf_header)
        for i, line in enumerate(input_file):
            raw_url = line.strip()
            if not raw_url:
                # A blank line would otherwise become an attachment with an
                # empty URL.
                continue
            # Both values are placed inside XML attributes, so double quotes
            # must be escaped too (second argument = quote).
            url = escape(raw_url, True)
            try:
                # Fetch with the *unescaped* URL: escaping first would turn
                # every '&' in a query string into '&amp;' and request the
                # wrong address.
                title = escape(parse(raw_url).find(".//title").text.strip(), True)
            except Exception:
                # Best effort: any download/parse problem (or a page without
                # a <title>) falls back to the URL.  KeyboardInterrupt is no
                # longer swallowed.
                title = url
            output_file.write(rdf_item % (i, url, i, title, date_now, i))
        output_file.write(rdf_footer)
Also, an aside: this reminds me of just how bad the current RDF support is in Zotero. Hopefully we can get that fixed soon. I have some other Python code that works pretty well for dealing with the RDF.
BTW, are you sure you have that RDF encoded right? Seems odd to put a title property on the z:Attachment resource.
You're welcome to update it if you really want to; however, I won't be updating it myself since I don't need it anymore...
This is the generic ZoteroRDF writer
import logging
from xml.sax.saxutils import escape
import lxml.etree as xmlet
logger = logging.getLogger(__name__)
class ZoteroRDF:
    """Context manager that provides a writer for one Zotero RDF file.

    Entering the ``with`` block creates a ``_ZoteroRDFWriter`` for
    ``filename`` and yields it; leaving the block calls the writer's
    ``_exit()`` method.
    """

    def __init__(self, filename):
        """Store the output path; the writer is only created on entry."""
        self.filename = filename

    def __enter__(self):
        """Create the writer for ``self.filename``, keep it, and return it."""
        writer = _ZoteroRDFWriter(self.filename)
        self.writer = writer
        return writer

    def __exit__(self, type, value, traceback):
        """Finish the writer.

        Returns ``None``, so an exception raised inside the ``with`` block
        is not suppressed.
        """
        self.writer._exit()
class _ZoteroRDFWriter:
    """Stream Zotero RDF report items and attachments into a file.

    The file is opened and ``HEADER`` written by ``__init__``; ``_exit()``
    writes ``FOOTER`` and closes it.  In between, ``add_bib_item`` and
    ``add_attachment`` append one XML fragment each.

    All caller-supplied values are converted with ``str()`` and XML-escaped
    before being written, so values such as ``"R&D"`` or an integer year
    produce well-formed output.
    """

    HEADER = """<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:z="http://www.zotero.org/namespaces/export#"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:vcard="http://nwalsh.com/rdf/vCard#"
xmlns:foaf="http://xmlns.com/foaf/0.1/"
xmlns:bib="http://purl.org/net/biblio#"
xmlns:link="http://purl.org/rss/1.0/modules/link/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:prism="http://prismstandard.org/namespaces/1.2/basic/">
"""
    FOOTER = "\n</rdf:RDF>"

    # Maps an item key to how it is rendered.  Each entry has exactly one of:
    #   tagname     -- value is wrapped in <tagname>...</tagname>
    #                  (a list yields one element per entry)
    #   placeholder -- a format template; a dict value fills named fields,
    #                  any other value fills the single positional field
    #   function    -- callable(fd, value) that writes the output itself
    TAG_MAP = {
        'url': dict(placeholder="<dc:identifier><dcterms:URI><rdf:value>{}</rdf:value></dcterms:URI></dc:identifier>"),
        "title": dict(tagname="dc:title"),
        "date": dict(tagname="dcterms:date"),
        "year": dict(tagname="dcterms:date"),
        "publisher": dict(tagname="dcterms:publisher"),
        "journal": dict(tagname="dcterms:source"),
        "volume": dict(tagname="dcterms:volume"),
        "number": dict(tagname="dcterms:issue"),
        "pages": dict(tagname="bib:pages"),
        "doi": dict(tagname="dcterms:identifier"),
        "abstract": dict(tagname="dcterms:abstract"),
        "extra": dict(tagname="dc:description"),
        "language": dict(tagname="z:language"),
        "keywords": dict(tagname="dc:subject"),
        "keyword": dict(tagname="dc:subject"),
        "tags": dict(tagname="dc:subject"),
        "tag": dict(tagname="dc:subject"),
        "reporttype": dict(tagname="z:type"),
        "reportno": dict(tagname="prism:number"),
        "organization": dict(placeholder="\n<dc:publisher><foaf:Organization><vcard:adr><vcard:Address><vcard:locality>{place}</vcard:locality></vcard:Address></vcard:adr><foaf:name>{name}</foaf:name></foaf:Organization></dc:publisher>"),
    }

    # Item keys that are rendered as the <bib:authors> sequence.
    _AUTHOR_KEYS = ("authors", "author", "creator", "creators")

    def __init__(self, filename):
        """Open ``filename`` for writing and emit the RDF header."""
        self.filename = filename
        # The output carries no XML declaration, so readers assume UTF-8;
        # write UTF-8 explicitly instead of the platform default encoding.
        self.fd = open(self.filename, "wt", encoding="utf-8")
        self.fd.write(self.HEADER)

    def _exit(self):
        """Write the RDF footer and close the file."""
        self.fd.write(self.FOOTER)
        self.fd.close()

    @staticmethod
    def _text(value):
        """Return ``value`` as a string escaped for XML element content."""
        return escape(str(value))

    @staticmethod
    def _attr(value):
        """Return ``value`` escaped for use inside a double-quoted attribute."""
        return escape(str(value), {'"': "&quot;"})

    def _render_mapped(self, spec, value):
        """Build the XML fragment for a ``placeholder`` or ``tagname`` entry.

        Raises:
            Exception: if ``spec`` has neither key (malformed ``TAG_MAP``).
        """
        if 'placeholder' in spec:
            template = spec['placeholder']
            if isinstance(value, dict):
                return template.format_map({k: self._text(v) for k, v in value.items()})
            return template.format(self._text(value))
        if 'tagname' in spec:
            tag = spec['tagname']
            if isinstance(value, (list, tuple)):
                return "\n".join(f"\n<{tag}>{self._text(v)}</{tag}>" for v in value)
            return f"\n<{tag}>{self._text(value)}</{tag}>"
        raise Exception(f"Unknown tag type {spec} - malformed TAG_MAP?")

    def _render_authors(self, value):
        """Build the ``<bib:authors>`` fragment.

        ``value`` is one mapping with ``first`` and ``last`` keys, or a
        list/tuple of such mappings.
        """
        people = value if isinstance(value, (list, tuple)) else [value]
        entries = "\n".join(
            "<rdf:li><foaf:Person><foaf:givenName>{first}</foaf:givenName><foaf:surname>{last}</foaf:surname></foaf:Person></rdf:li>".format(
                first=self._text(person['first']), last=self._text(person['last'])
            )
            for person in people
        )
        return "\n<bib:authors><rdf:Seq>" + entries + "\n</rdf:Seq></bib:authors>"

    def add_bib_item(self, item):
        """Append one ``bib:Report`` element built from the mapping ``item``.

        ``item`` must contain ``id``.  Every other key must be either a key
        of ``TAG_MAP`` or one of the author keys (``authors``, ``author``,
        ``creator``, ``creators``).  A link to ``#attach_<id>`` is always
        written at the end of the element.

        Raises:
            ValueError: if ``id`` is missing or a key is not supported.
        """
        if 'id' not in item:
            raise ValueError("Missing id")
        itemid = self._attr(item['id'])
        self.fd.write(f"\n\n<bib:Report rdf:about=\"#item_{itemid}\">\n <z:itemType>report</z:itemType>\n")
        for key, value in item.items():
            if key in self.TAG_MAP:
                spec = self.TAG_MAP[key]
                if 'function' in spec:
                    # Custom renderers write to the file themselves.
                    spec['function'](self.fd, value)
                    continue
                # Format once, then both write and log the same fragment.
                fragment = self._render_mapped(spec, value)
                self.fd.write(fragment)
                logger.debug(fragment)
            elif key in self._AUTHOR_KEYS:
                self.fd.write(self._render_authors(value))
            elif key == "id":
                # Already emitted as the rdf:about attribute.
                continue
            else:
                raise ValueError("Key {} not supported".format(key))
        self.fd.write(f"\n<link:link rdf:resource=\"#attach_{itemid}\"/>")
        self.fd.write("\n</bib:Report>")

    def add_attachment(self, item):
        """Append one ``z:Attachment`` element.

        ``item`` must contain ``id``, ``path``, ``name`` and ``type``; other
        keys are ignored.

        Raises:
            ValueError: if any of the four required keys is missing.
        """
        missing = [key for key in ('id', 'path', 'name', 'type') if key not in item]
        if missing:
            raise ValueError("Missing attachment field(s): {}".format(", ".join(missing)))
        attach_id = self._attr(item['id'])
        path = self._attr(item['path'])
        name = self._text(item['name'])
        mime = self._text(item['type'])
        self.fd.write(
            f'\n<z:Attachment rdf:about="#attach_{attach_id}">'
            '\n<z:itemType>attachment</z:itemType>'
            f'\n<rdf:resource rdf:resource="{path}"/>'
            f'\n<dc:title>{name}</dc:title>'
            f'\n<link:type>{mime}</link:type>'
            '\n</z:Attachment>'
        )
And this is a simple usage example:
# Usage example: for every entry whose "<identifier>.pdf" file exists, write
# a report item followed by its PDF attachment; otherwise report the missing
# file on stdout.
# NOTE(review): `your_list_of_items`, `authors`, `abstract` and `Path` are not
# defined in this snippet -- the caller is presumably expected to provide them
# (e.g. `from pathlib import Path`); confirm before running.
with ZoteroRDF("00_bd.rdf") as rdf:
    for entry in your_list_of_items:
        itemid = entry['identifier']
        # The item is built before the file check, as in the original flow.
        item = {
            "id": itemid,
            "title": entry['title'],
            "date": entry['date'],
            "author": authors,
            "abstract": abstract,
            "extra": entry['publisher'],
            "keywords": [entry['subject_1'], entry['subject_2'], entry['subject_3']],
            "language": "es",
        }
        path = '{}.pdf'.format(itemid)
        if not Path(path).exists():
            print("Missing {}. File not here".format(path))
            continue
        rdf.add_bib_item(item)
        rdf.add_attachment(dict(id=itemid, path=path, name=f'{itemid}.pdf', type='application/pdf'))