idunn.no translator
Hi there,
idunn.no is a major source of journal articles in Norway, as it hosts all the journals from the journal association. I have created a translator for single articles. It's fragile and painful, because the site exposes no real metadata. The code for getting page numbers is especially painful.
Here goes:
/**
 * Reports whether the page looks like a single idunn.no article.
 *
 * The decision is based only on the page title, which must have the shape
 * "idunn.no - ... - <journal> - <year> - Nr <issue> - <article title>".
 *
 * @param {Document} doc - Page being inspected; only `doc.title` is read.
 * @param {string} url - Address of the page (unused).
 * @returns {string|undefined} "journalArticle" for an article page,
 *   otherwise undefined.
 */
function detectWeb(doc, url) {
	// `.+?` instead of `[\w\s]+`: in JavaScript `\w` only covers ASCII
	// letters, digits and "_", so segments containing Norwegian letters
	// (ae/oe/aa ligatures) or punctuation would never match.
	var articleTitlePattern = /idunn\.no - .+? - .+? - \d+ - Nr \d+ - .+/;
	if (articleTitlePattern.test(doc.title)) {
		return "journalArticle";
	}
}
/**
 * Translator entry point: hands the page straight to scrape().
 *
 * @param {Document} doc - The page to translate.
 * @param {string} url - Address of the page.
 */
function doWeb(doc, url) {
	scrape(doc, url);
}
/**
 * Scrapes a single idunn.no article page into a Zotero journalArticle item
 * and saves it.
 *
 * Authors come from the `span.author` elements, page numbers from the
 * "Side N" markers in the article body and in its `div.pagebreak` elements,
 * and title, journal, year and issue from the page title.
 *
 * @param {Document} doc - The article page.
 * @param {string} url - Address of the page; stored on the item and used
 *   for the snapshot attachment.
 * @throws {Error} When the page title does not have the shape
 *   "idunn.no - ... - <journal> - <year> - Nr <issue> - <article title>".
 */
function scrape(doc, url) {
	// Parse the title first so an unrecognised page fails with a clear
	// message before anything is built. `.+?` (not `[\w\s]+`) lets journal
	// names contain non-ASCII letters, which `\w` does not cover.
	var metadata = doc.title.match(/idunn\.no - .+? - (.+?) - (\d+) - Nr (\d+) - (.*)/);
	if (!metadata) {
		throw new Error("idunn.no translator: could not parse page title: " + doc.title);
	}

	// Article body; a page without the container simply yields no page numbers.
	var bodyNode = doc.evaluate("//div[@class='documentDIV']", doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	var allRefText = bodyNode ? bodyNode.innerHTML : "";

	var authors = [];
	var authorNodes = doc.evaluate("//span[@class='author']", doc, null, XPathResult.ANY_TYPE, null);
	var author;
	while ((author = authorNodes.iterateNext())) {
		// Normalise every non-breaking space (entity or character), then
		// drop the "Av " byline prefix.
		var authorText = author.innerHTML.replace(/&nbsp;|\u00a0/g, " ").replace("Av ", "");
		authors.push(Zotero.Utilities.cleanAuthor(authorText, "author"));
	}

	// The first "Side N" marker in the body is the first page.
	var pagestart = getItem(allRefText, /Side (\d+)/);

	// The last pagebreak that actually carries a number is the last page.
	var lastpage = "";
	var pagebreaks = doc.evaluate("//div[@class='pagebreak']", doc, null, XPathResult.ANY_TYPE, null);
	var pagebreak;
	while ((pagebreak = pagebreaks.iterateNext())) {
		var pageNumber = getItem(pagebreak.innerHTML, /Side (\d+)/);
		if (pageNumber) {
			lastpage = pageNumber;
		}
	}

	// Build the range only from the parts that were found, so the item never
	// gets "12-undefined", "-" or a degenerate "5-5".
	var pages = pagestart || "";
	if (lastpage && lastpage !== pagestart) {
		pages = pagestart ? pagestart + "-" + lastpage : lastpage;
	}

	var newArticle = new Zotero.Item('journalArticle');
	newArticle.title = Zotero.Utilities.cleanString(metadata[4]);
	newArticle.publicationTitle = metadata[1];
	// Zotero items carry the publication year in `date`; `year` is not an
	// item field.
	newArticle.date = metadata[2];
	newArticle.issue = metadata[3];
	newArticle.creators = authors;
	if (pages) {
		newArticle.pages = pages;
	}
	newArticle.url = url;
	newArticle.attachments.push({url:url, title:"Snapshot from idunn.no", snapshot:true, mimeType:"text/html"});
	Zotero.debug(newArticle);
	newArticle.complete();
	// NOTE(review): no matching Zotero.done() call is visible in this
	// translator - confirm this wait() is still wanted.
	Zotero.wait();
}
/**
 * Returns the text captured by the first group of `re` in `reftext`.
 *
 * The lookup is best-effort: it never throws and always returns a string.
 *
 * @param {string} reftext - Text to search.
 * @param {RegExp} re - Pattern whose first capture group is wanted.
 * @returns {string} Index 1 of the match result, or "" when `reftext`
 *   cannot be searched, the pattern does not match, or index 1 is absent.
 */
function getItem(reftext, re) {
	var found;
	try {
		found = reftext.match(re);
	} catch (err) {
		// Unsearchable input (null, undefined, no match method) or an
		// unusable pattern counts as "nothing found".
		return "";
	}
	// A match without a participating first group would otherwise leak
	// `undefined`, which callers concatenate into strings.
	if (!found || found[1] == null) {
		return "";
	}
	return found[1];
}
idunn.no is a major source of journal articles in Norway, as it hosts all the journals from the journal association. I have created a translator for single articles. It's fragile and painful, because the site exposes no real metadata. The code for getting page numbers is especially painful.
Here goes:
/**
 * Reports whether the page looks like a single idunn.no article.
 *
 * The decision is based only on the page title, which must have the shape
 * "idunn.no - ... - <journal> - <year> - Nr <issue> - <article title>".
 *
 * @param {Document} doc - Page being inspected; only `doc.title` is read.
 * @param {string} url - Address of the page (unused).
 * @returns {string|undefined} "journalArticle" for an article page,
 *   otherwise undefined.
 */
function detectWeb(doc, url) {
	// `.+?` instead of `[\w\s]+`: in JavaScript `\w` only covers ASCII
	// letters, digits and "_", so segments containing Norwegian letters
	// (ae/oe/aa ligatures) or punctuation would never match.
	var articleTitlePattern = /idunn\.no - .+? - .+? - \d+ - Nr \d+ - .+/;
	if (articleTitlePattern.test(doc.title)) {
		return "journalArticle";
	}
}
/**
 * Translator entry point: hands the page straight to scrape().
 *
 * @param {Document} doc - The page to translate.
 * @param {string} url - Address of the page.
 */
function doWeb(doc, url) {
	scrape(doc, url);
}
/**
 * Scrapes a single idunn.no article page into a Zotero journalArticle item
 * and saves it.
 *
 * Authors come from the `span.author` elements, page numbers from the
 * "Side N" markers in the article body and in its `div.pagebreak` elements,
 * and title, journal, year and issue from the page title.
 *
 * @param {Document} doc - The article page.
 * @param {string} url - Address of the page; stored on the item and used
 *   for the snapshot attachment.
 * @throws {Error} When the page title does not have the shape
 *   "idunn.no - ... - <journal> - <year> - Nr <issue> - <article title>".
 */
function scrape(doc, url) {
	// Parse the title first so an unrecognised page fails with a clear
	// message before anything is built. `.+?` (not `[\w\s]+`) lets journal
	// names contain non-ASCII letters, which `\w` does not cover.
	var metadata = doc.title.match(/idunn\.no - .+? - (.+?) - (\d+) - Nr (\d+) - (.*)/);
	if (!metadata) {
		throw new Error("idunn.no translator: could not parse page title: " + doc.title);
	}

	// Article body; a page without the container simply yields no page numbers.
	var bodyNode = doc.evaluate("//div[@class='documentDIV']", doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	var allRefText = bodyNode ? bodyNode.innerHTML : "";

	var authors = [];
	var authorNodes = doc.evaluate("//span[@class='author']", doc, null, XPathResult.ANY_TYPE, null);
	var author;
	while ((author = authorNodes.iterateNext())) {
		// Normalise every non-breaking space (entity or character), then
		// drop the "Av " byline prefix.
		var authorText = author.innerHTML.replace(/&nbsp;|\u00a0/g, " ").replace("Av ", "");
		authors.push(Zotero.Utilities.cleanAuthor(authorText, "author"));
	}

	// The first "Side N" marker in the body is the first page.
	var pagestart = getItem(allRefText, /Side (\d+)/);

	// The last pagebreak that actually carries a number is the last page.
	var lastpage = "";
	var pagebreaks = doc.evaluate("//div[@class='pagebreak']", doc, null, XPathResult.ANY_TYPE, null);
	var pagebreak;
	while ((pagebreak = pagebreaks.iterateNext())) {
		var pageNumber = getItem(pagebreak.innerHTML, /Side (\d+)/);
		if (pageNumber) {
			lastpage = pageNumber;
		}
	}

	// Build the range only from the parts that were found, so the item never
	// gets "12-undefined", "-" or a degenerate "5-5".
	var pages = pagestart || "";
	if (lastpage && lastpage !== pagestart) {
		pages = pagestart ? pagestart + "-" + lastpage : lastpage;
	}

	var newArticle = new Zotero.Item('journalArticle');
	newArticle.title = Zotero.Utilities.cleanString(metadata[4]);
	newArticle.publicationTitle = metadata[1];
	// Zotero items carry the publication year in `date`; `year` is not an
	// item field.
	newArticle.date = metadata[2];
	newArticle.issue = metadata[3];
	newArticle.creators = authors;
	if (pages) {
		newArticle.pages = pages;
	}
	newArticle.url = url;
	newArticle.attachments.push({url:url, title:"Snapshot from idunn.no", snapshot:true, mimeType:"text/html"});
	Zotero.debug(newArticle);
	newArticle.complete();
	// NOTE(review): no matching Zotero.done() call is visible in this
	// translator - confirm this wait() is still wanted.
	Zotero.wait();
}
/**
 * Returns the text captured by the first group of `re` in `reftext`.
 *
 * The lookup is best-effort: it never throws and always returns a string.
 *
 * @param {string} reftext - Text to search.
 * @param {RegExp} re - Pattern whose first capture group is wanted.
 * @returns {string} Index 1 of the match result, or "" when `reftext`
 *   cannot be searched, the pattern does not match, or index 1 is absent.
 */
function getItem(reftext, re) {
	var found;
	try {
		found = reftext.match(re);
	} catch (err) {
		// Unsearchable input (null, undefined, no match method) or an
		// unusable pattern counts as "nothing found".
		return "";
	}
	// A match without a participating first group would otherwise leak
	// `undefined`, which callers concatenate into strings.
	if (!found || found[1] == null) {
		return "";
	}
	return found[1];
}