"""Pandas dataframe reader."""
import logging
from typing import Union
import pandas as pd
from .base import Reader
from ..constants import get_obo_google_sheets_df, safe
from ..models import Author, Authorship, Institution
# Public names exported by a star-import of this module.
__all__ = [
"OboSheetReader",
"OboGoogleSheetReader",
]
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Google Sheets identifier of the example SSSOM paper spreadsheet
# (the same identifier appears in the example URLs in the reader docstrings).
EXAMPLE = "1NfhibWHOKgV2glmgRdKMzHEzTCw2_dUq_t0Zq64cgeQ"
# Column headers of the author part of the sheet template. get_author() looks up
# most of these by name ("Author Position" and "Author Confirm" are not read there).
# NOTE(review): COLUMNS itself is not referenced in the visible code -- confirm
# whether it is used elsewhere or is purely informational.
COLUMNS = [
"First name",
"Middle (optional)",
"Last name",
"Author Position",
"Author Confirm",
"Email",
"ORCID",
"Wikidata",
"Twitter",
"Organization + location (will separate later)",
]
class OboSheetReader(Reader):
    """Reader that extracts authorship from an in-memory pandas dataframe."""

    def __init__(self, df: pd.DataFrame):
        """Store the dataframe to read from.

        :param df:
            The dataframe representing the authorship. Example spreadsheets
            that follow the appropriate format:

            - SSSOM paper https://docs.google.com/spreadsheets/d/1NfhibWHOKgV2glmgRdKMzHEzTCw2_dUq_t0Zq64cgeQ
            - ODK paper https://docs.google.com/spreadsheets/d/1JMo1ZyytnJGXr7biYqxMzV7DIAzlfyum5jcrtetG8lI
            - CL paper https://docs.google.com/spreadsheets/d/1G9b6NOyUkMJUI2ZiCWoFSDdoKf1KWdfTwlDoTJ4zntM
              (broken since many authors missing email/orcid)
        """
        self.df = df

    def get_authorship(self) -> Authorship:
        """Build the authorship from the stored dataframe.

        The last two columns of the dataframe are read as the institution
        table (a numeric code and the institution text); all preceding columns
        are read as the author table.

        :returns: The authors together with every institution from the institution table
        """
        frame = self.df

        # Institution table: the two trailing columns, relabelled for convenience.
        organizations = frame[frame.columns[-2:]]
        organizations.columns = ["code", "institution"]
        has_code = organizations["code"].notna()
        code_to_institution: dict[int, Institution] = {}
        # Rows without a code carry no institution and are skipped.
        for code, text in organizations[has_code].values:
            code_to_institution[int(code)] = get_org(text)

        # Author table: everything before the institution table, keeping only
        # rows whose first column is filled in.
        people = frame[frame.columns[:-2]]
        has_first_cell = people[people.columns[0]].notna()
        people = people[has_first_cell]
        authors = []
        for _, record in people.iterrows():
            authors.append(get_author(record, code_to_institution))

        return Authorship(
            authors=authors,
            institutions=list(code_to_institution.values()),
        )
class OboGoogleSheetReader(OboSheetReader):
    """Reader that loads the authorship dataframe from Google Sheets.

    An example sheet that has the right template can be found at:
    https://docs.google.com/spreadsheets/d/1NfhibWHOKgV2glmgRdKMzHEzTCw2_dUq_t0Zq64cgeQ
    using ``skiprows=1``
    """

    def __init__(
        self,
        google_sheet: str,
        *,
        gid: Union[str, int] = 0,
        skiprows=None,
    ):
        """Load the sheet and initialize the underlying dataframe reader.

        :param google_sheet: The identifier of the google sheet
        :param gid: The sheet identifier (in case there are more than one)
        :param skiprows: Forwarded as ``skiprows`` to the sheet loader; should rows be skipped?
        """
        # The loaded dataframe goes straight to the parent reader.
        super().__init__(get_obo_google_sheets_df(google_sheet, gid, skiprows=skiprows))
def get_org(text: str) -> Institution:
    """Create an institution whose name is the given text.

    :param text: The institution text as written in the sheet
    :returns: An institution named with the full, unparsed text
    """
    # todo parse out address
    institution = Institution(name=text)
    return institution
def get_author(row, code_to_institution: dict[int, Institution]) -> Author:
    """Get an author from a row.

    :param row: A row of the author sheet, indexable by column header (e.g., a
        :class:`pandas.Series` produced by ``DataFrame.iterrows``)
    :param code_to_institution: A mapping from integer affiliation codes to institutions
    :returns: The author, with institutions resolved from the affiliation codes
    :raises ValueError: If the affiliation cell is empty, or if none of its
        codes correspond to a known institution
    """
    first = row["First name"]
    last = row["Last name"]
    codes_value = row["Organization + location (will separate later)"]
    if pd.isna(codes_value):
        raise ValueError(f"no affiliations written for {first} {last}")

    if isinstance(codes_value, str):
        # Comma-separated codes; skip empty tokens so a stray or trailing
        # comma ("1, 2,") does not crash integer parsing.
        tokens = (token.strip() for token in codes_value.split(","))
        codes = [int(token) for token in tokens if token]
    else:
        # pandas parses a column holding only single codes as numbers, so the
        # cell may already be an int/float rather than a string.
        codes = [int(codes_value)]

    institutions = []
    for code in codes:
        if code not in code_to_institution:
            # Unknown codes are reported but do not abort the author.
            logger.warning("missing organization code %d for %s %s", code, first, last)
        else:
            institutions.append(code_to_institution[code])
    if not institutions:
        raise ValueError(f"no affiliation for {first} {last}")

    wikidata = row["Wikidata"]
    twitter = row["Twitter"]
    return Author(
        first=first,
        middle=safe(row["Middle (optional)"]),
        last=last,
        email=safe(row["Email"]),
        orcid=row["ORCID"],
        # Store the bare Wikidata identifier rather than the full page URL.
        wikidata=(
            wikidata.removeprefix("https://www.wikidata.org/wiki/")
            if pd.notna(wikidata)
            else None
        ),
        # Store the Twitter handle without the leading "@".
        twitter=twitter.removeprefix("@") if pd.notna(twitter) else None,
        institutions=institutions,
    )