AlterNet translator submission
I have created an AlterNet translator for http://www.alternet.org and it seems to work fine. The site provides no metadata, so the scraper is not elegant, but it works.
However, I am unsure about how to submit it for use in the default translator database. Please point me in the right direction.
Also, I am likely to submit more translators in the near future, so let me know if this code does not meet the standard; I will work on it until it is sufficient.
-- Translator row. The last value on this line is the URL pattern the translator is matched against;
-- the two string values that follow hold the detection code and the scraping code.
-- The pattern makes "www." optional and escapes the dot in the host name.
REPLACE INTO translators VALUES ('ea531652-cdeb-4ec2-940e-627d4b107263', '1.0.0b4.r1', '', '2008-06-19 22:27:17', '0', '100', '4', 'AlterNet', 'Jesse Johnson', '^http://(?:www\.)?alternet\.org',
'function detectWeb(doc, url) {
// identifies articles according to the presence of an article ID
// number in the URL
var index = url.toString().indexOf(''.org/'') + 5;
index += url.toString().substr(index).indexOf(''/'');
if (index != -1) {
// ordinary aritcle
var id = url.toString().substr(index + 1, 5);
Zotero.Utilities.cleanString(id);
if (Number(id)) {
return "magazineArticle";
}
//columnist or blog article
index += url.toString().substr(index + 1).indexOf(''/'');
id = url.toString().substr(index + 2, 5);
Zotero.Utilities.cleanString(id);
if (Number(id) && url.toString().search(''blog'') == -1) {
return "magazineArticle";
}
else if (Number(id)) {
return "blogPost";
}
}
return null;
}',
'function scrape(doc, url, title) {
var index = url.toString().indexOf(''.org/'') + 5;
index += url.toString().substr(index).indexOf(''/'');
if (index != -1) {
// ordinary aritcle
var id = url.toString().substr(index + 1, 5);
Zotero.Utilities.cleanString(id);
if (Number(id)) {
var newItem = new Zotero.Item("magazineArticle");
}
//columnist or blog article
index += url.toString().substr(index + 1).indexOf(''/'');
id = url.toString().substr(index + 2, 5);
Zotero.Utilities.cleanString(id);
if (Number(id) && url.toString().search(''blog'') == -1) {
var newItem = new Zotero.Item("magazineArticle");
}
else if (Number(id)) {
var newItem = new Zotero.Item("blogPost");
}
}
newItem.url = url;
newItem.title = title;
if (newItem.itemType == "magazineArticle") {
newItem.publicationTitle = "AlterNet";
newItem.repository = "alternet.org";
}
else if (newItem.itemType == "blogPost") {
newItem.websiteType = "AlterNet Blog";
}
// general scraping variables
var xpath;
// author
if (newItem.itemType == "magazineArticle") {
xpath = ''//p[@class="storybyline"]//a[contains(@href,"author")]'';
}
else if (newItem.itemType == "blogPost") {
xpath = ''//p[@class="storybyline"]//a[contains(@href,"bloggers")]'';
}
temp = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
if (temp) {
var author = Zotero.Utilities.trimInternal(temp.textContent);
if(author.substr(0, 3).toLowerCase() == "by ") {
author = author.substr(3);
}
var authors = author.split(",");
for each (var author in authors) {
newItem.creators.push(Zotero.Utilities.cleanAuthor(author, "author"));
}
}
// date
if (newItem.itemType == "magazineArticle") {
xpath = ''//p[@class="storybyline"]//a[contains(@href,"date")]'';
temp = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
var date = Zotero.Utilities.strToDate(temp.textContent);
}
else if (newItem.itemType == "blogPost") {
xpath = ''//p[@class="storybyline"]/b'';
temp = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
var begin = temp.textContent.lastIndexOf(" on ");
temp = temp.textContent.substr(begin + 4);
var date = Zotero.Utilities.strToDate(temp.substr(0, temp.length - 1));
}
if (date != null) {
date.month = date.month + 1;
if (date.month < 10) {
date = date.year + ''-'' + ''0'' + date.month + ''-'' + date.day;
}
else {
date = date.year+ ''-'' + date.month + ''-'' + date.day;
}
newItem.date = date;
}
// abstract
xpath = ''//div[@class="teaser"]//div[contains(@class,"teaser")]'';
temp = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
if (temp) {
newItem.abstractNote = Zotero.Utilities.trimInternal(temp.textContent);
}
// article snapshot
// grabs 5-digit article code from url and uses it to derive printable page url for use in article snapshot
var index = url.toString().indexOf(''.org/'') + 5;
index += url.toString().substr(index).indexOf(''/'');
if (index != -1) {
var printurl;
// ordinary article
var id = url.toString().substr(index + 1, 5);
if (Number(id)) {
printurl = "http://www.alternet.org/module/printversion/" + id;
newItem.attachments.push({url:printurl, title:"AlterNet Article Snapshot", mimeType:"text/html"});
}
// columnist article
else {
index += url.toString().substr(index + 1).indexOf(''/'');
id = url.toString().substr(index + 2, 5);
Zotero.Utilities.cleanString(id);
if (Number(id)) {
printurl = "http://www.alternet.org/module/printversion/" + id;
if (newItem.itemType == "blogPost") {
printurl += "/?type=blog";
}
newItem.attachments.push({url:printurl, title:"AlterNet Article Snapshot", mimeType:"text/html"});
}
}
}
newItem.complete();
}
function doWeb(doc, url) {
// ordinary and columnist articles
var xpath = ''//p[@class="storyheadline"]'';
var title;
if (title = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
scrape(doc, url, title.textContent);
}
return null;
}');
However, I am unsure about how to submit it for use in the default translator database. Please point me in the right direction.
Also, I am likely to submit more translators in the near future, so let me know if this code does not meet the standard; I will work on it until it is sufficient.
-- Translator row. The last value on this line is the URL pattern the translator is matched against;
-- the two string values that follow hold the detection code and the scraping code.
-- The pattern makes "www." optional and escapes the dot in the host name.
REPLACE INTO translators VALUES ('ea531652-cdeb-4ec2-940e-627d4b107263', '1.0.0b4.r1', '', '2008-06-19 22:27:17', '0', '100', '4', 'AlterNet', 'Jesse Johnson', '^http://(?:www\.)?alternet\.org',
'function detectWeb(doc, url) {
// identifies articles according to the presence of an article ID
// number in the URL
var index = url.toString().indexOf(''.org/'') + 5;
index += url.toString().substr(index).indexOf(''/'');
if (index != -1) {
// ordinary aritcle
var id = url.toString().substr(index + 1, 5);
Zotero.Utilities.cleanString(id);
if (Number(id)) {
return "magazineArticle";
}
//columnist or blog article
index += url.toString().substr(index + 1).indexOf(''/'');
id = url.toString().substr(index + 2, 5);
Zotero.Utilities.cleanString(id);
if (Number(id) && url.toString().search(''blog'') == -1) {
return "magazineArticle";
}
else if (Number(id)) {
return "blogPost";
}
}
return null;
}',
'function scrape(doc, url, title) {
var index = url.toString().indexOf(''.org/'') + 5;
index += url.toString().substr(index).indexOf(''/'');
if (index != -1) {
// ordinary aritcle
var id = url.toString().substr(index + 1, 5);
Zotero.Utilities.cleanString(id);
if (Number(id)) {
var newItem = new Zotero.Item("magazineArticle");
}
//columnist or blog article
index += url.toString().substr(index + 1).indexOf(''/'');
id = url.toString().substr(index + 2, 5);
Zotero.Utilities.cleanString(id);
if (Number(id) && url.toString().search(''blog'') == -1) {
var newItem = new Zotero.Item("magazineArticle");
}
else if (Number(id)) {
var newItem = new Zotero.Item("blogPost");
}
}
newItem.url = url;
newItem.title = title;
if (newItem.itemType == "magazineArticle") {
newItem.publicationTitle = "AlterNet";
newItem.repository = "alternet.org";
}
else if (newItem.itemType == "blogPost") {
newItem.websiteType = "AlterNet Blog";
}
// general scraping variables
var xpath;
// author
if (newItem.itemType == "magazineArticle") {
xpath = ''//p[@class="storybyline"]//a[contains(@href,"author")]'';
}
else if (newItem.itemType == "blogPost") {
xpath = ''//p[@class="storybyline"]//a[contains(@href,"bloggers")]'';
}
temp = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
if (temp) {
var author = Zotero.Utilities.trimInternal(temp.textContent);
if(author.substr(0, 3).toLowerCase() == "by ") {
author = author.substr(3);
}
var authors = author.split(",");
for each (var author in authors) {
newItem.creators.push(Zotero.Utilities.cleanAuthor(author, "author"));
}
}
// date
if (newItem.itemType == "magazineArticle") {
xpath = ''//p[@class="storybyline"]//a[contains(@href,"date")]'';
temp = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
var date = Zotero.Utilities.strToDate(temp.textContent);
}
else if (newItem.itemType == "blogPost") {
xpath = ''//p[@class="storybyline"]/b'';
temp = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
var begin = temp.textContent.lastIndexOf(" on ");
temp = temp.textContent.substr(begin + 4);
var date = Zotero.Utilities.strToDate(temp.substr(0, temp.length - 1));
}
if (date != null) {
date.month = date.month + 1;
if (date.month < 10) {
date = date.year + ''-'' + ''0'' + date.month + ''-'' + date.day;
}
else {
date = date.year+ ''-'' + date.month + ''-'' + date.day;
}
newItem.date = date;
}
// abstract
xpath = ''//div[@class="teaser"]//div[contains(@class,"teaser")]'';
temp = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
if (temp) {
newItem.abstractNote = Zotero.Utilities.trimInternal(temp.textContent);
}
// article snapshot
// grabs 5-digit article code from url and uses it to derive printable page url for use in article snapshot
var index = url.toString().indexOf(''.org/'') + 5;
index += url.toString().substr(index).indexOf(''/'');
if (index != -1) {
var printurl;
// ordinary article
var id = url.toString().substr(index + 1, 5);
if (Number(id)) {
printurl = "http://www.alternet.org/module/printversion/" + id;
newItem.attachments.push({url:printurl, title:"AlterNet Article Snapshot", mimeType:"text/html"});
}
// columnist article
else {
index += url.toString().substr(index + 1).indexOf(''/'');
id = url.toString().substr(index + 2, 5);
Zotero.Utilities.cleanString(id);
if (Number(id)) {
printurl = "http://www.alternet.org/module/printversion/" + id;
if (newItem.itemType == "blogPost") {
printurl += "/?type=blog";
}
newItem.attachments.push({url:printurl, title:"AlterNet Article Snapshot", mimeType:"text/html"});
}
}
}
newItem.complete();
}
function doWeb(doc, url) {
// ordinary and columnist articles
var xpath = ''//p[@class="storyheadline"]'';
var title;
if (title = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
scrape(doc, url, title.textContent);
}
return null;
}');