Autobib
Note: After writing this post, I got carried away and turned it into a Python package: https://github.com/GregoryAshton/easybib (I had to rename it because there is already an autobib).
This program searches for BibTeX keys in all .tex files in a given directory.
The code assumes the .tex files use either INSPIRE or NASA/ADS keys (i.e., the keys given by default by these websites).
It then looks up the BibTeX entry for each key at its source and writes the entries to a file.
Download the file, save it as autobib.py, then run
$ python autobib.py --help
to see the options for using it.
#!/usr/bin/env python3
import re
import requests
from pathlib import Path
import argparse
def extract_cite_keys(tex_file):
    r"""Extract all citation keys from a LaTeX file.

    Every ``\cite``-family command is scanned (``\cite``, ``\citep``,
    ``\citet``, ``\citealt``, ``\citeauthor``, ``\Citep``, ...), including
    starred variants such as ``\citet*{key}`` and commands carrying optional
    arguments such as ``\citep[e.g.][]{key}``.

    Args:
        tex_file: Path of the ``.tex`` file to scan (anything ``open`` accepts).

    Returns:
        A tuple ``(keys, warnings)``. ``keys`` lists, in order of appearance
        and with duplicates kept, every cited key that contains a colon.
        ``warnings`` holds one message per empty key and one per key without
        a colon; such keys are skipped, which includes bare ADS bibcodes.
    """
    # errors="replace": a stray non-UTF-8 byte elsewhere in the document must
    # not abort the scan; only the undecodable bytes themselves are replaced.
    with open(tex_file, "r", encoding="utf-8", errors="replace") as f:
        content = f.read()

    # \cite or \Cite, optional letters (p, t, author, ...), an optional "*",
    # any number of [..] optional arguments, then the braced key list.
    pattern = r"\\[Cc]ite[a-zA-Z]*\*?(?:\[[^\]]*\])*\{([^}]+)\}"

    keys = []
    warnings = []
    for match in re.findall(pattern, content):
        # A single command may cite several comma-separated keys.
        for key in match.split(","):
            key = key.strip()
            if not key:
                warnings.append(f"{tex_file}: Empty citation key found")
            elif ":" not in key:
                warnings.append(f"{tex_file}: Skipping key '{key}' (not an INSPIRE/ADS key)")
            else:
                keys.append(key)
    return keys, warnings
def get_inspire_bibtex(key, *, timeout=30):
    """Fetch BibTeX directly from INSPIRE for a given INSPIRE key.

    Args:
        key: Citation key, looked up through INSPIRE's ``texkeys`` field.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body with surrounding whitespace stripped, or ``None``
        when the status is not 200 or the body is empty.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://inspirehep.net/api/literature"
    # Passing the query via ``params`` lets requests percent-encode it; a key
    # containing "&" or "#" spliced straight into the URL would cut it short.
    params = {"q": f"texkeys:{key}"}
    headers = {"Accept": "application/x-bibtex"}
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text.strip() or None
def get_ads_info_from_inspire(key, *, timeout=30):
    """Fetch ADS bibcode and arXiv ID from INSPIRE for a given INSPIRE key.

    Only the first search hit is inspected.

    Args:
        key: Citation key, looked up through INSPIRE's ``texkeys`` field.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(ads_bibcode, arxiv_id)``; either may be ``None``. Both are
        ``None`` when the status is not 200 or there are no hits.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://inspirehep.net/api/literature"
    # Use texkeys field to avoid colon being interpreted as field operator.
    # Sending it through ``params`` percent-encodes the key, so characters
    # such as "&" or "#" cannot truncate the query.
    params = {"q": f"texkeys:{key}"}
    headers = {"Accept": "application/json"}
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None, None

    hits = response.json().get("hits", {}).get("hits", [])
    if not hits:
        return None, None
    metadata = hits[0].get("metadata", {})

    # First external identifier whose schema is "ADS", if any.
    ads_bibcode = None
    for ext_id in metadata.get("external_system_identifiers", []):
        if ext_id.get("schema") == "ADS":
            ads_bibcode = ext_id.get("value")
            break

    # First arXiv eprint, which callers use as a fallback lookup.
    arxiv_eprints = metadata.get("arxiv_eprints", [])
    arxiv_id = arxiv_eprints[0].get("value") if arxiv_eprints else None

    return ads_bibcode, arxiv_id
def search_ads_by_arxiv(arxiv_id, api_key, *, timeout=30):
    """Search ADS for a paper by arXiv ID and return its bibcode.

    Args:
        arxiv_id: arXiv identifier, queried as ``arXiv:<arxiv_id>``.
        api_key: ADS API token, sent as a bearer token.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``bibcode`` of the first matching document, or ``None`` when the
        status is not 200 or no document matches.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://api.adsabs.harvard.edu/v1/search/query"
    headers = {"Authorization": f"Bearer {api_key}"}
    params = {"q": f"arXiv:{arxiv_id}", "fl": "bibcode"}
    # Without a timeout a stalled connection would block the run forever.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    if response.status_code != 200:
        return None
    docs = response.json().get("response", {}).get("docs", [])
    return docs[0].get("bibcode") if docs else None
def get_ads_bibtex(bibcode, api_key, *, timeout=30):
    """Fetch BibTeX from ADS for a given bibcode.

    Args:
        bibcode: ADS bibcode to export.
        api_key: ADS API token, sent as a bearer token.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stripped ``export`` text of the response, or ``None`` when the
        status is not 200, the export is empty, or it starts with
        ``"No records"``.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://api.adsabs.harvard.edu/v1/export/bibtex"
    headers = {"Authorization": f"Bearer {api_key}"}
    data = {"bibcode": [bibcode]}
    # Without a timeout a stalled connection would block the run forever.
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    if response.status_code != 200:
        return None
    export = response.json().get("export", "").strip()
    if export and not export.startswith("No records"):
        return export
    return None
def extract_existing_bib_keys(bib_file):
    """Return the set of entry keys found in an existing BibTeX file.

    ``bib_file`` must offer ``exists()`` (e.g. a ``pathlib.Path``). A missing
    file yields an empty set.
    """
    if bib_file.exists():
        with open(bib_file, "r", encoding="utf-8") as handle:
            text = handle.read()
        # An entry header has the form "@type{key,".
        header = re.compile(r"@\w+\s*\{\s*([^,\s]+)\s*,")
        return {found.group(1) for found in header.finditer(text)}
    return set()
def replace_bibtex_key(bibtex, new_key):
    """Replace the citation key of the first BibTeX entry with ``new_key``.

    Args:
        bibtex: BibTeX text containing at least one ``@type{key,`` header.
        new_key: Key to insert; it is inserted literally.

    Returns:
        ``bibtex`` with the first entry's key swapped for ``new_key``. Text
        without an entry header is returned unchanged.
    """
    # Match the entry type and key: @article{old_key,
    pattern = r"(@\w+\s*\{)\s*[^,\s]+\s*,"

    # A callable replacement keeps new_key out of re's template syntax. In a
    # template, a key starting with digits would fuse with the "\1" group
    # reference (e.g. "\1" + "2016..." parses as octal "\120"), and any
    # backslash in the key would be treated as an escape.
    def _with_new_key(match):
        return f"{match.group(1)}{new_key},"

    return re.sub(pattern, _with_new_key, bibtex, count=1)
def truncate_authors(bibtex, max_authors):
    """Truncate the author list in a BibTeX entry to ``max_authors``.

    The first ``author`` field is located whether its value is delimited by
    braces (``author = {...}``) or by double quotes (``author = "..."``).
    If it lists more than ``max_authors`` names, the first ``max_authors``
    are kept and ``and others`` is appended. Everything outside the author
    value is left byte-for-byte unchanged.

    Args:
        bibtex: Text of a BibTeX entry.
        max_authors: Maximum number of authors to keep. ``None``, ``0`` or a
            negative number disables truncation.

    Returns:
        The entry with a shortened author value, or ``bibtex`` unchanged when
        truncation is disabled, no author field is found, or the list is
        already short enough.
    """
    if not max_authors or max_authors < 0:
        return bibtex

    # The value ends at the first closing delimiter that is followed by an
    # optional comma and a line break, so nested braces such as
    # "{Abbott}, B." inside the value (possibly spanning lines) are skipped.
    author_pattern = (
        r'(\s*author\s*=\s*)'
        r'(?:\{(?P<braced>.+?)\}|"(?P<quoted>.+?)")'
        r'(?=,?\s*\n)'
    )
    match = re.search(author_pattern, bibtex, re.IGNORECASE | re.DOTALL)
    if not match:
        return bibtex

    group = "braced" if match.group("braced") is not None else "quoted"

    # Split authors by " and " (BibTeX standard separator).
    authors = [a.strip() for a in re.split(r"\s+and\s+", match.group(group))]
    if len(authors) <= max_authors:
        return bibtex

    new_authors_str = " and ".join(authors[:max_authors] + ["others"])

    # Splice only the author value so delimiters and the rest stay intact.
    start, end = match.span(group)
    return bibtex[:start] + new_authors_str + bibtex[end:]
def is_ads_bibcode(key):
    """Tell whether ``key`` has the shape of an ADS bibcode.

    Example of an accepted key: ``2016PhRvL.116f1102A``. The key must be at
    least 15 characters long, start with four digits followed by a run of
    letters/``&``/``.``, contain a ``.`` after that run, and end with an
    uppercase letter.
    """
    # Cheap length check first; anything shorter can never qualify.
    if len(key) < 15:
        return False
    shape = r"^\d{4}[A-Za-z&.]+\..*[A-Z]$"
    return re.match(shape, key) is not None
def is_inspire_key(key):
    """Tell whether ``key`` has the shape of an INSPIRE texkey.

    Accepted form: a name of two or more characters (a letter followed by
    letters, digits or hyphens), a colon, a four-digit year, then two or
    three lowercase letters, e.g. ``Author:2020abc``.
    """
    shape = re.compile(r"^[A-Za-z][A-Za-z0-9-]+:\d{4}[a-z]{2,3}$")
    return shape.match(key) is not None
def fetch_bibtex_ads_preferred(key, api_key):
    """Fetch BibTeX preferring ADS, with INSPIRE as the last resort.

    Attempts, in order: ADS directly when the key looks like a bibcode; ADS
    using the bibcode INSPIRE reports for the key; ADS using a bibcode found
    by searching for the arXiv ID INSPIRE reports; ADS with the raw key;
    INSPIRE's own BibTeX.

    Returns:
        ``(bibtex, source_label)`` from the first attempt that yields an
        entry, or ``(None, None)`` when all fail.
    """
    # 1. The key may already be an ADS bibcode.
    if is_ads_bibcode(key):
        entry = get_ads_bibtex(key, api_key)
        if entry:
            return entry, "ADS (direct)"

    # 2. Ask INSPIRE for the cross-references of this key.
    inspire_bibcode, eprint = get_ads_info_from_inspire(key)

    if inspire_bibcode:
        entry = get_ads_bibtex(inspire_bibcode, api_key)
        if entry:
            return entry, f"ADS via INSPIRE ({inspire_bibcode})"

    # 3. Resolve the arXiv ID to a bibcode on ADS.
    if eprint:
        arxiv_bibcode = search_ads_by_arxiv(eprint, api_key)
        entry = get_ads_bibtex(arxiv_bibcode, api_key) if arxiv_bibcode else None
        if entry:
            return entry, f"ADS via arXiv ({eprint})"

    # 4. Hand the raw key to ADS regardless of its shape.
    entry = get_ads_bibtex(key, api_key)
    if entry:
        return entry, "ADS (direct fallback)"

    # 5. Last resort: INSPIRE's own BibTeX.
    entry = get_inspire_bibtex(key)
    if entry:
        return entry, "INSPIRE (fallback)"

    return None, None
def fetch_bibtex_inspire_preferred(key, api_key):
    """Fetch BibTeX preferring INSPIRE, with ADS as fallback.

    Args:
        key: Citation key to resolve.
        api_key: ADS API token. When falsy, the ADS fallback is skipped
            entirely, because every ADS request sends it as a bearer token.

    Returns:
        ``(bibtex, source_label)`` from the first source that yields an
        entry, or ``(None, None)`` when none does.
    """
    # Try INSPIRE first.
    bibtex = get_inspire_bibtex(key)
    if bibtex:
        return bibtex, "INSPIRE"

    # Without a token the ADS requests below would go out as "Bearer None",
    # so do not send them.
    if not api_key:
        return None, None

    # Fall back to ADS: directly, when the key looks like a bibcode.
    if is_ads_bibcode(key):
        bibtex = get_ads_bibtex(key, api_key)
        if bibtex:
            return bibtex, "ADS (fallback, direct)"

    # Otherwise go through the cross-references INSPIRE holds for the key.
    ads_bibcode, arxiv_id = get_ads_info_from_inspire(key)
    if ads_bibcode:
        bibtex = get_ads_bibtex(ads_bibcode, api_key)
        if bibtex:
            return bibtex, "ADS (fallback, via INSPIRE)"

    if arxiv_id:
        ads_bibcode = search_ads_by_arxiv(arxiv_id, api_key)
        if ads_bibcode:
            bibtex = get_ads_bibtex(ads_bibcode, api_key)
            if bibtex:
                return bibtex, "ADS (fallback, via arXiv)"

    return None, None
def fetch_bibtex_auto(key, api_key):
    """Fetch BibTeX from whichever source matches the key's format.

    Keys shaped like ADS bibcodes go to ADS first, then INSPIRE. All other
    keys go to INSPIRE first, then to ADS through the bibcode or arXiv ID
    that INSPIRE reports.

    Returns:
        ``(bibtex, source_label)``, or ``(None, None)`` when nothing is found.
    """
    if is_ads_bibcode(key):
        # Bibcode-shaped key: ADS is the natural source.
        entry = get_ads_bibtex(key, api_key)
        if entry:
            return entry, "ADS (auto)"
        entry = get_inspire_bibtex(key)
        if entry:
            return entry, "INSPIRE (fallback)"
        return None, None

    # Any other key is treated as an INSPIRE key.
    entry = get_inspire_bibtex(key)
    if entry:
        return entry, "INSPIRE (auto)"

    # INSPIRE had no BibTeX; try ADS through INSPIRE's cross-references.
    inspire_bibcode, eprint = get_ads_info_from_inspire(key)
    if inspire_bibcode:
        entry = get_ads_bibtex(inspire_bibcode, api_key)
        if entry:
            return entry, "ADS (fallback, via INSPIRE)"

    if eprint:
        arxiv_bibcode = search_ads_by_arxiv(eprint, api_key)
        entry = get_ads_bibtex(arxiv_bibcode, api_key) if arxiv_bibcode else None
        if entry:
            return entry, "ADS (fallback, via arXiv)"

    return None, None
def fetch_bibtex(key, api_key, source="ads"):
    """Dispatch to the fetch strategy selected by ``source``.

    ``"inspire"`` and ``"auto"`` pick their own strategies; ``"ads"`` and any
    unrecognised value use the ADS-preferred strategy.
    """
    if source == "inspire":
        return fetch_bibtex_inspire_preferred(key, api_key)
    if source == "auto":
        return fetch_bibtex_auto(key, api_key)
    # "ads" and every other value share the ADS-preferred path.
    return fetch_bibtex_ads_preferred(key, api_key)
def _build_parser():
    """Create the command-line argument parser for the script."""
    parser = argparse.ArgumentParser(
        description="Extract citations and download BibTeX from NASA/ADS"
    )
    parser.add_argument("directory", help="Directory containing LaTeX files")
    parser.add_argument(
        "-o", "--output", default="references.bib", help="Output BibTeX file"
    )
    parser.add_argument(
        "-a",
        "--max-authors",
        type=int,
        default=3,
        help="Maximum number of authors before truncating with 'and others' (default: 3, use 0 for no limit)",
    )
    parser.add_argument(
        "-l",
        "--list-keys",
        action="store_true",
        help="List citation keys found in LaTeX files and exit (no lookup)",
    )
    parser.add_argument(
        "--fresh",
        action="store_true",
        help="Start from scratch, ignoring existing output file",
    )
    parser.add_argument(
        "-s",
        "--source",
        choices=["ads", "inspire", "auto"],
        default="ads",
        help="Preferred BibTeX source: 'ads' (default), 'inspire', or 'auto' (based on key format)",
    )
    return parser


def main(argv=None):
    """Collect citation keys from LaTeX files and write their BibTeX entries.

    Steps: scan every ``.tex`` file under the given directory for citation
    keys, skip keys already present in the output file (unless ``--fresh``),
    fetch the remaining entries, and rewrite the output file with the
    existing content followed by the new entries.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        ``0`` on success, ``1`` when the directory is invalid or the ADS API
        key is missing while an ADS-backed source is selected.
    """
    import os

    args = _build_parser().parse_args(argv)

    # A mistyped directory must not be mistaken for "no citations": with
    # --fresh that would replace the output file with an empty one.
    tex_dir = Path(args.directory)
    if not tex_dir.is_dir():
        print(f"Error: {args.directory} is not a directory")
        return 1

    # Collect all citation keys (sorted so warnings come out in a stable order).
    all_keys = set()
    all_warnings = []
    for tex_file in sorted(tex_dir.glob("**/*.tex")):
        keys, warnings = extract_cite_keys(tex_file)
        all_keys.update(keys)
        all_warnings.extend(warnings)

    # Print warnings for invalid keys.
    if all_warnings:
        print("Warnings:")
        for warning in all_warnings:
            print(f" {warning}")
        print()
    print(f"Found {len(all_keys)} unique citation keys")

    # If --list-keys, print keys and exit.
    if args.list_keys:
        for key in sorted(all_keys):
            print(key)
        return 0

    # Check for ADS API key (not required if using --source inspire).
    api_key = os.getenv("ADS_API_KEY")
    if not api_key and args.source != "inspire":
        print("Error: ADS_API_KEY environment variable not set")
        print("Get your API key from: https://ui.adsabs.harvard.edu/user/settings/token")
        print("(Or use --source inspire to fetch from INSPIRE without an ADS key)")
        return 1

    # Check for existing bib file and determine which keys to fetch.
    output_path = Path(args.output)
    existing_content = ""
    if not args.fresh and output_path.exists():
        existing_keys = extract_existing_bib_keys(output_path)
        keys_to_fetch = all_keys - existing_keys
        with open(output_path, "r", encoding="utf-8") as f:
            existing_content = f.read().strip()
        print(f"Found {len(existing_keys)} existing entries in {args.output}")
        print(f"Fetching {len(keys_to_fetch)} new keys")
    else:
        keys_to_fetch = all_keys
        if args.fresh and output_path.exists():
            print(f"Starting fresh (ignoring existing {args.output})")

    # Download BibTeX entries.
    bibtex_entries = []
    not_found = []
    for key in sorted(keys_to_fetch):
        # flush so the key is visible while the (slow) lookup is running.
        print(f"Fetching {key}...", end=" ", flush=True)
        try:
            bibtex, source = fetch_bibtex(key, api_key, args.source)
        except requests.RequestException as err:
            # One failed request must not discard the entries fetched so
            # far; record the key as missing and carry on.
            not_found.append(key)
            print(f"✗ Request failed ({err})")
            continue
        if bibtex:
            bibtex = replace_bibtex_key(bibtex, key)
            bibtex = truncate_authors(bibtex, args.max_authors)
            bibtex_entries.append(bibtex)
            print(f"✓ {source}")
        else:
            not_found.append(key)
            print("✗ Not found")

    # Write output: existing content (if any) followed by the new entries,
    # separated by blank lines.
    sections = ([existing_content] if existing_content else []) + bibtex_entries
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(sections))
    print(f"\nWrote {len(bibtex_entries)} new entries to {args.output}")

    if not_found:
        print(f"\nCould not find {len(not_found)} keys:")
        for key in not_found:
            print(f" - {key}")
    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status, so the
    # error paths that return 1 are visible to the calling shell.
    raise SystemExit(main())