Python code for web-scraping citations and saving them in Zotero
Hi,
I am building a code in Python with Chatgtp that web-scrapes publications from google scholar and saves the citation records in one specific folder of my Zotero. Basically, I want to automatize the process of: 1- sending a query to google scholar; 2- clicking on the link represented by the title of a publication (i.e., the link on the upper left of each record), and, by doing this, going to the webpage of the publication; 3- clicking on the extension/plug-in button of Zotero I have installed on my google chrome browser and, by doing this, saving in Zotero the citation record relative to the publication webpage I accessed at step 2. Despite multiple (displayed between quotation marks at the bottom of this message) modifications, the code cannot achieve the tasks abovementioned. Nor it's a problem of my Zotero account, API, permissions, client tools, or syncing: it's all right there. If anybody achieved to do something reflecting my intentions, please let me know!
Here, between quotation marks, follows the code I used. It can scrape citations from Scholar but is completely unable to save them in Zotero, and it doesn't return error messages.
"!pip install pyzotero
!pip install titlecase
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import requests
import time
from bs4 import BeautifulSoup
import re
import tempfile
import os
from pyzotero.zotero import Zotero
from titlecase import titlecase
from urllib.parse import urljoin
# Your Zotero API key and user ID
zotero_api_key = "yIphlkdQ1zfqbTfcjIuC2CnR"
zotero_user_id = "7793142"
# Connect to your Zotero library
zot = Zotero(zotero_user_id, "user", zotero_api_key)
# NEW CODE: Specify the collection name and find or create the collection
collection_name = "prova dal codice"
existing_collections = zot.collections(q=collection_name)
if existing_collections:
target_collection = existing_collections[0]
else:
target_collection = zot.create_collections([{"name": collection_name}])[0]
# END NEW CODE
print("Target collection name:", target_collection["data"]["name"])
print("Target collection key:", target_collection["key"])
api_key = "0e9cc7d3533e76bdc5de76c0c9ef7a586d71714d1893670116582a7777014384"
search_query = '("basal metabolic rate" OR BMR) ("body temperature" OR Tb) ("lower critical temperature" OR Tlc) (mammal OR bird OR avian) (species OR taxa)'
num_results_per_page = 10
total_pages = 2
delay_seconds = 5
def get_abbreviated_journal_name(full_name):
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
"db": "nlmcatalog",
"term": f"{full_name}[Title]",
"retmode": "json"
}
response = requests.get(base_url, params=params)
data = response.json()
if data['esearchresult']['count'] == '0':
return None
id_list = data['esearchresult']['idlist']
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
params = {
"db": "nlmcatalog",
"id": ",".join(id_list),
"retmode": "json"
}
response = requests.get(base_url, params=params)
data = response.json()
for item in data['result'].values():
if 'Title' in item and item['Title'] == full_name:
if 'NlmUniqueID' in item:
return item['NlmUniqueID']
return None
def format_title(title):
title = titlecase(title)
# Capitalize the first word after a colon
title = re.sub(r":\s?(\w)", lambda m: f": {m.group(1).upper()}", title)
# Make sure the genus name is capitalized, and the specific epithet is lowercase
title = re.sub(r"([A-Z][a-z]+\s[a-z]+\s[a-z]+)", lambda m: f"{m.group(1).capitalize()} {m.group(1).split()[1].lower()}", title)
return title
def download_citation(url, title, snippet):
citation_formats = [".bib", ".ris", ".enw"]
response = requests.get(url, verify=False)
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a", href=True):
href = link["href"]
if any(ext in href for ext in citation_formats):
citation_url = urljoin(url, href)
citation_response = requests.get(citation_url, verify=False)
return citation_response.text
# Fallback when no citation file is available
return {
"itemType": "journalArticle",
"title": title,
"url": url,
"abstractNote": snippet,
}
for page in range(total_pages):
start = page * num_results_per_page
params = {
"engine": "google_scholar",
"q": search_query,
"api_key": api_key,
"num": num_results_per_page,
"start": start
}
response = requests.get("https://serpapi.com/search", params=params)
results = response.json()
for result in results["organic_results"]:
title = result["title"]
if "link" not in result:
print(f"Link not found for title: {title}\n")
continue
link = result["link"]
snippet = result["snippet"]
print(f"Title: {title}\nLink: {link}\nSnippet: {snippet}\n")
citation_content = download_citation(link, title, snippet)
print("Citation content:", citation_content)
if isinstance(citation_content, str): # Case when citation_content is a citation file
with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file:
temp_file.write(citation_content)
temp_file.flush()
added_item = zot.add_items(temp_file.name)
os.unlink(temp_file.name)
elif isinstance(citation_content, dict): # Case when citation_content is a dictionary
added_item = zot.create_items([citation_content])
else:
continue
# Get the full journal name from the added item
if isinstance(added_item, list) and len(added_item) > 0:
added_item_data = added_item[0]['data']
elif isinstance(added_item, dict) and 'data' in added_item:
added_item_data = added_item['data']
else:
continue
print("Added item to Zotero:", added_item_data)
full_journal_name = added_item_data.get('publicationTitle')
# If a full journal name is found, try to get the abbreviated version
if full_journal_name:
abbreviated_journal_name = get_abbreviated_journal_name(full_journal_name)
# Update the Zotero item with the abbreviated journal name, if found
if abbreviated_journal_name:
added_item[0]['data']['journalAbbreviation'] = abbreviated_journal_name
time.sleep(1) # Add delay before updating Zotero item
zot.update_item(added_item[0])
# Format the title according to the given criteria
formatted_title = format_title(title)
if formatted_title != title:
added_item[0]['data']['title'] = formatted_title
time.sleep(1) # Add delay before updating Zotero item
zot.update_item(added_item[0])
# NEW CODE: Add the item to the specified collection
zot.addto_collection(target_collection["key"], added_item[0]["key"])
print(f"Added item {added_item[0]['key']} to collection {target_collection['key']}")
if page < total_pages - 1:
time.sleep(delay_seconds)
"
I am building a code in Python with Chatgtp that web-scrapes publications from google scholar and saves the citation records in one specific folder of my Zotero. Basically, I want to automatize the process of: 1- sending a query to google scholar; 2- clicking on the link represented by the title of a publication (i.e., the link on the upper left of each record), and, by doing this, going to the webpage of the publication; 3- clicking on the extension/plug-in button of Zotero I have installed on my google chrome browser and, by doing this, saving in Zotero the citation record relative to the publication webpage I accessed at step 2. Despite multiple (displayed between quotation marks at the bottom of this message) modifications, the code cannot achieve the tasks abovementioned. Nor it's a problem of my Zotero account, API, permissions, client tools, or syncing: it's all right there. If anybody achieved to do something reflecting my intentions, please let me know!
Here, between quotation marks, follows the code I used. It can scrape citations from Scholar but is completely unable to save them in Zotero, and it doesn't return error messages.
"!pip install pyzotero
!pip install titlecase
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import requests
import time
from bs4 import BeautifulSoup
import re
import tempfile
import os
from pyzotero.zotero import Zotero
from titlecase import titlecase
from urllib.parse import urljoin
# Your Zotero API key and user ID
zotero_api_key = "yIphlkdQ1zfqbTfcjIuC2CnR"
zotero_user_id = "7793142"
# Connect to your Zotero library
zot = Zotero(zotero_user_id, "user", zotero_api_key)
# NEW CODE: Specify the collection name and find or create the collection
collection_name = "prova dal codice"
existing_collections = zot.collections(q=collection_name)
if existing_collections:
target_collection = existing_collections[0]
else:
target_collection = zot.create_collections([{"name": collection_name}])[0]
# END NEW CODE
print("Target collection name:", target_collection["data"]["name"])
print("Target collection key:", target_collection["key"])
api_key = "0e9cc7d3533e76bdc5de76c0c9ef7a586d71714d1893670116582a7777014384"
search_query = '("basal metabolic rate" OR BMR) ("body temperature" OR Tb) ("lower critical temperature" OR Tlc) (mammal OR bird OR avian) (species OR taxa)'
num_results_per_page = 10
total_pages = 2
delay_seconds = 5
def get_abbreviated_journal_name(full_name):
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
"db": "nlmcatalog",
"term": f"{full_name}[Title]",
"retmode": "json"
}
response = requests.get(base_url, params=params)
data = response.json()
if data['esearchresult']['count'] == '0':
return None
id_list = data['esearchresult']['idlist']
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
params = {
"db": "nlmcatalog",
"id": ",".join(id_list),
"retmode": "json"
}
response = requests.get(base_url, params=params)
data = response.json()
for item in data['result'].values():
if 'Title' in item and item['Title'] == full_name:
if 'NlmUniqueID' in item:
return item['NlmUniqueID']
return None
def format_title(title):
title = titlecase(title)
# Capitalize the first word after a colon
title = re.sub(r":\s?(\w)", lambda m: f": {m.group(1).upper()}", title)
# Make sure the genus name is capitalized, and the specific epithet is lowercase
title = re.sub(r"([A-Z][a-z]+\s[a-z]+\s[a-z]+)", lambda m: f"{m.group(1).capitalize()} {m.group(1).split()[1].lower()}", title)
return title
def download_citation(url, title, snippet):
citation_formats = [".bib", ".ris", ".enw"]
response = requests.get(url, verify=False)
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a", href=True):
href = link["href"]
if any(ext in href for ext in citation_formats):
citation_url = urljoin(url, href)
citation_response = requests.get(citation_url, verify=False)
return citation_response.text
# Fallback when no citation file is available
return {
"itemType": "journalArticle",
"title": title,
"url": url,
"abstractNote": snippet,
}
for page in range(total_pages):
start = page * num_results_per_page
params = {
"engine": "google_scholar",
"q": search_query,
"api_key": api_key,
"num": num_results_per_page,
"start": start
}
response = requests.get("https://serpapi.com/search", params=params)
results = response.json()
for result in results["organic_results"]:
title = result["title"]
if "link" not in result:
print(f"Link not found for title: {title}\n")
continue
link = result["link"]
snippet = result["snippet"]
print(f"Title: {title}\nLink: {link}\nSnippet: {snippet}\n")
citation_content = download_citation(link, title, snippet)
print("Citation content:", citation_content)
if isinstance(citation_content, str): # Case when citation_content is a citation file
with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file:
temp_file.write(citation_content)
temp_file.flush()
added_item = zot.add_items(temp_file.name)
os.unlink(temp_file.name)
elif isinstance(citation_content, dict): # Case when citation_content is a dictionary
added_item = zot.create_items([citation_content])
else:
continue
# Get the full journal name from the added item
if isinstance(added_item, list) and len(added_item) > 0:
added_item_data = added_item[0]['data']
elif isinstance(added_item, dict) and 'data' in added_item:
added_item_data = added_item['data']
else:
continue
print("Added item to Zotero:", added_item_data)
full_journal_name = added_item_data.get('publicationTitle')
# If a full journal name is found, try to get the abbreviated version
if full_journal_name:
abbreviated_journal_name = get_abbreviated_journal_name(full_journal_name)
# Update the Zotero item with the abbreviated journal name, if found
if abbreviated_journal_name:
added_item[0]['data']['journalAbbreviation'] = abbreviated_journal_name
time.sleep(1) # Add delay before updating Zotero item
zot.update_item(added_item[0])
# Format the title according to the given criteria
formatted_title = format_title(title)
if formatted_title != title:
added_item[0]['data']['title'] = formatted_title
time.sleep(1) # Add delay before updating Zotero item
zot.update_item(added_item[0])
# NEW CODE: Add the item to the specified collection
zot.addto_collection(target_collection["key"], added_item[0]["key"])
print(f"Added item {added_item[0]['key']} to collection {target_collection['key']}")
if page < total_pages - 1:
time.sleep(delay_seconds)
"
b) ChatGPT isn't well suited to write code that doesn't have much in terms of templates on the web. E.g., in this case, it invents a pyzotero method "add_items" that doesn't exist and it calls create_items, which requires the data to be in a very specific format, without any attempt to turn it into that format. See here: https://pyzotero.readthedocs.io/en/latest/#creating-and-updating-items for the relevant documentation. This won't be a quick fix -- the data wrangling required is pretty significant still.
I don't know why you don't receive any errors, though -- this should crash and burn :)
Yes, I am running it on Google Colab and it gives no error. So is there any other method, even with another citation manager tool, to perform this automated task?
I believe the suggestions of @adamsmith might be a good point to go further in programmatically manipulating items of your library. Particularly, the link mentioned seems to be of a great use.