Python code for web-scraping citations and saving them in Zotero

Marcellobil · May 13, 2023

Hi,

I am building a code in Python with Chatgtp that web-scrapes publications from google scholar and saves the citation records in one specific folder of my Zotero. Basically, I want to automatize the process of: 1- sending a query to google scholar; 2- clicking on the link represented by the title of a publication (i.e., the link on the upper left of each record), and, by doing this, going to the webpage of the publication; 3- clicking on the extension/plug-in button of Zotero I have installed on my google chrome browser and, by doing this, saving in Zotero the citation record relative to the publication webpage I accessed at step 2. Despite multiple (displayed between quotation marks at the bottom of this message) modifications, the code cannot achieve the tasks abovementioned. Nor it's a problem of my Zotero account, API, permissions, client tools, or syncing: it's all right there. If anybody achieved to do something reflecting my intentions, please let me know!

Here, between quotation marks, follows the code I used. It can scrape citations from Scholar but is completely unable to save them in Zotero, and it doesn't return error messages.

"!pip install pyzotero
!pip install titlecase

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

import requests
import time
from bs4 import BeautifulSoup
import re
import tempfile
import os
from pyzotero.zotero import Zotero
from titlecase import titlecase
from urllib.parse import urljoin

# Your Zotero API key and user ID
zotero_api_key = "yIphlkdQ1zfqbTfcjIuC2CnR"
zotero_user_id = "7793142"

# Connect to your Zotero library
zot = Zotero(zotero_user_id, "user", zotero_api_key)

# NEW CODE: Specify the collection name and find or create the collection
collection_name = "prova dal codice"
existing_collections = zot.collections(q=collection_name)
if existing_collections:
target_collection = existing_collections[0]
else:
target_collection = zot.create_collections([{"name": collection_name}])[0]
# END NEW CODE

print("Target collection name:", target_collection["data"]["name"])
print("Target collection key:", target_collection["key"])

api_key = "0e9cc7d3533e76bdc5de76c0c9ef7a586d71714d1893670116582a7777014384"
search_query = '("basal metabolic rate" OR BMR) ("body temperature" OR Tb) ("lower critical temperature" OR Tlc) (mammal OR bird OR avian) (species OR taxa)'
num_results_per_page = 10
total_pages = 2
delay_seconds = 5

def get_abbreviated_journal_name(full_name):
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
"db": "nlmcatalog",
"term": f"{full_name}[Title]",
"retmode": "json"
}
response = requests.get(base_url, params=params)
data = response.json()
if data['esearchresult']['count'] == '0':
return None

id_list = data['esearchresult']['idlist']
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
params = {
"db": "nlmcatalog",
"id": ",".join(id_list),
"retmode": "json"
}
response = requests.get(base_url, params=params)
data = response.json()

for item in data['result'].values():
if 'Title' in item and item['Title'] == full_name:
if 'NlmUniqueID' in item:
return item['NlmUniqueID']
return None

def format_title(title):
title = titlecase(title)
# Capitalize the first word after a colon
title = re.sub(r":\s?(\w)", lambda m: f": {m.group(1).upper()}", title)
# Make sure the genus name is capitalized, and the specific epithet is lowercase
title = re.sub(r"([A-Z][a-z]+\s[a-z]+\s[a-z]+)", lambda m: f"{m.group(1).capitalize()} {m.group(1).split()[1].lower()}", title)
return title

def download_citation(url, title, snippet):
citation_formats = [".bib", ".ris", ".enw"]

response = requests.get(url, verify=False)
soup = BeautifulSoup(response.content, "html.parser")

for link in soup.find_all("a", href=True):
href = link["href"]
if any(ext in href for ext in citation_formats):
citation_url = urljoin(url, href)
citation_response = requests.get(citation_url, verify=False)
return citation_response.text
# Fallback when no citation file is available
return {
"itemType": "journalArticle",
"title": title,
"url": url,
"abstractNote": snippet,
}

for page in range(total_pages):
start = page * num_results_per_page
params = {
"engine": "google_scholar",
"q": search_query,
"api_key": api_key,
"num": num_results_per_page,
"start": start
}

response = requests.get("https://serpapi.com/search", params=params)
results = response.json()

for result in results["organic_results"]:
title = result["title"]
if "link" not in result:
print(f"Link not found for title: {title}\n")
continue
link = result["link"]
snippet = result["snippet"]
print(f"Title: {title}\nLink: {link}\nSnippet: {snippet}\n")

citation_content = download_citation(link, title, snippet)
print("Citation content:", citation_content)
if isinstance(citation_content, str): # Case when citation_content is a citation file
with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file:
temp_file.write(citation_content)
temp_file.flush()
added_item = zot.add_items(temp_file.name)
os.unlink(temp_file.name)
elif isinstance(citation_content, dict): # Case when citation_content is a dictionary
added_item = zot.create_items([citation_content])
else:
continue

# Get the full journal name from the added item
if isinstance(added_item, list) and len(added_item) > 0:
added_item_data = added_item[0]['data']
elif isinstance(added_item, dict) and 'data' in added_item:
added_item_data = added_item['data']

else:
continue
print("Added item to Zotero:", added_item_data)
full_journal_name = added_item_data.get('publicationTitle')

# If a full journal name is found, try to get the abbreviated version
if full_journal_name:
abbreviated_journal_name = get_abbreviated_journal_name(full_journal_name)

# Update the Zotero item with the abbreviated journal name, if found
if abbreviated_journal_name:
added_item[0]['data']['journalAbbreviation'] = abbreviated_journal_name
time.sleep(1) # Add delay before updating Zotero item
zot.update_item(added_item[0])

# Format the title according to the given criteria
formatted_title = format_title(title)
if formatted_title != title:
added_item[0]['data']['title'] = formatted_title
time.sleep(1) # Add delay before updating Zotero item
zot.update_item(added_item[0])

# NEW CODE: Add the item to the specified collection
zot.addto_collection(target_collection["key"], added_item[0]["key"])
print(f"Added item {added_item[0]['key']} to collection {target_collection['key']}")
if page < total_pages - 1:
time.sleep(delay_seconds)

"

adamsmith · May 13, 2023

a) it looks like you posted your actual write-enable Zotero API key above. I haven't tested that, but if that's the case, you should delete that key from Zotero
b) ChatGPT isn't well suited to write code that doesn't have much in terms of templates on the web. E.g., in this case, it invents a pyzotero method "add_items" that doesn't exist and it calls create_items, which requires the data to be in a very specific format, without any attempt to turn it into that format. See here: https://pyzotero.readthedocs.io/en/latest/#creating-and-updating-items for the relevant documentation. This won't be a quick fix -- the data wrangling required is pretty significant still.

I don't know why you don't receive any errors, though -- this should crash and burn :)

Marcellobil · May 13, 2023

Hi, thank you for the warning! I deleted that key.
Yes, I am running it on Google Colab and it gives no error. So is there any other method, even with another citation manager tool, to perform this automated task?

emirsale · January 31, 2024

Hi @Marcellobil!
I believe the suggestions of @adamsmith might be a good point to go further in programmatically manipulating items of your library. Particularly, the link mentioned seems to be of a great use.