# Source code for authorship.readers.obo_sheet

"""Pandas dataframe reader."""

import logging
from typing import Union

import pandas as pd

from .base import Reader
from ..constants import get_obo_google_sheets_df, safe
from ..models import Author, Authorship, Institution

# Public names exported by this module: the two reader classes.
__all__ = [
    "OboSheetReader",
    "OboGoogleSheetReader",
]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Identifier of an example Google Sheet; the same identifier appears in the
# example URLs given in the reader docstrings.
EXAMPLE = "1NfhibWHOKgV2glmgRdKMzHEzTCw2_dUq_t0Zq64cgeQ"
# Column headers of the author part of the sheet. get_author() looks up
# several of these by name; the list itself is not referenced elsewhere in
# the visible code.
COLUMNS = [
    "First name",
    "Middle (optional)",
    "Last name",
    "Author Position",
    "Author Confirm",
    "Email",
    "ORCID",
    "Wikidata",
    "Twitter",
    "Organization + location (will separate later)",
]


class OboSheetReader(Reader):
    """Read authorship information from a pandas dataframe.

    The dataframe is interpreted by column position: the last two columns
    form the organization table (numeric code, institution text) and all
    columns before them describe one author per row.
    """

    def __init__(self, df: pd.DataFrame):
        """Initialize the sheet reader.

        :param df:
            The dataframe representing the authorship. Find an example of a
            spreadsheet following the appropriate format at one of:

            - SSSOM paper https://docs.google.com/spreadsheets/d/1NfhibWHOKgV2glmgRdKMzHEzTCw2_dUq_t0Zq64cgeQ
            - ODK paper https://docs.google.com/spreadsheets/d/1JMo1ZyytnJGXr7biYqxMzV7DIAzlfyum5jcrtetG8lI
            - CL paper https://docs.google.com/spreadsheets/d/1G9b6NOyUkMJUI2ZiCWoFSDdoKf1KWdfTwlDoTJ4zntM
              (broken since many authors missing email/orcid)
        """
        self.df = df

    def get_authorship(self) -> Authorship:
        """Get authors and institutions.

        :returns: The authorship built from every author row (rows whose
            first column is empty are skipped) and every institution that
            has a code.
        :raises ValueError: Propagated from :func:`get_author` when a row
            has no usable affiliation.
        """
        df = self.df

        # Organization table: select by position so that the header text of
        # these two columns does not matter and no labels need rewriting.
        org_codes = df.iloc[:, -2]
        org_names = df.iloc[:, -1]
        has_code = org_codes.notna()
        code_to_institution: dict[int, Institution] = {
            int(code): get_org(name)
            for code, name in zip(org_codes[has_code], org_names[has_code])
        }

        # Author table: everything left of the organization table. Rows with
        # an empty first column are padding and carry no author.
        author_sheet = df.iloc[:, :-2]
        author_sheet = author_sheet[author_sheet.iloc[:, 0].notna()]
        authors = [get_author(row, code_to_institution) for _, row in author_sheet.iterrows()]
        return Authorship(authors=authors, institutions=list(code_to_institution.values()))


class OboGoogleSheetReader(OboSheetReader):
    """Read from google sheets.

    An example sheet that has the right template can be found at:
    https://docs.google.com/spreadsheets/d/1NfhibWHOKgV2glmgRdKMzHEzTCw2_dUq_t0Zq64cgeQ
    using ``skiprows=1``
    """

    def __init__(
        self,
        google_sheet: str,
        *,
        gid: Union[str, int] = 0,
        skiprows=None,
    ):
        """Initialize the sheet reader.

        :param google_sheet: The identifier of the google sheet
        :param gid: The sheet identifier (in case there are more than one)
        :param skiprows: Should rows be skipped? Forwarded unchanged to
            ``get_obo_google_sheets_df``.
        """
        # Download the sheet as a dataframe, then reuse the dataframe reader.
        df = get_obo_google_sheets_df(google_sheet, gid, skiprows=skiprows)
        super().__init__(df)


def get_org(text: str) -> Institution:
    """Build an institution whose name is the given text."""
    # todo parse out address
    institution = Institution(name=text)
    return institution


def get_author(row, code_to_institution: dict[int, Institution]) -> Author:
    """Get an author from a row.

    :param row: A row of the author table, indexable by column header
        (e.g. a pandas Series).
    :param code_to_institution: Lookup from organization code to institution.
    :returns: The author, affiliated with every institution whose code was
        found in ``code_to_institution``.
    :raises ValueError: If the affiliation cell is empty, if a code is not
        an integer, or if none of the listed codes is known.
    """
    first = row["First name"]
    last = row["Last name"]
    codes_cell = row["Organization + location (will separate later)"]
    if pd.isna(codes_cell):
        raise ValueError(f"no affiliations written for {first} {last}")

    if isinstance(codes_cell, str):
        # Drop blank pieces so stray or trailing commas ("1, 2,") are
        # tolerated instead of failing on int("").
        pieces = (piece.strip() for piece in codes_cell.split(","))
        codes = [int(piece) for piece in pieces if piece]
    else:
        # When every cell in the column holds a single code, pandas may have
        # parsed the column as numbers, so the cell is not a string.
        codes = [int(codes_cell)]

    institutions = []
    for code in codes:
        if code not in code_to_institution:
            logger.warning("missing organization code %d for %s %s", code, first, last)
        else:
            institutions.append(code_to_institution[code])
    if not institutions:
        raise ValueError(f"no affiliation for {first} {last}")

    # Store identifiers without their URL / handle prefixes.
    wikidata = row["Wikidata"]
    if pd.notna(wikidata):
        wikidata = wikidata.removeprefix("https://www.wikidata.org/wiki/")
    else:
        wikidata = None
    twitter = row["Twitter"]
    twitter = twitter.removeprefix("@") if pd.notna(twitter) else None

    return Author(
        first=first,
        middle=safe(row["Middle (optional)"]),
        last=last,
        email=safe(row["Email"]),
        # NOTE(review): unlike email, ORCID is passed through without safe();
        # confirm whether a missing ORCID should be normalized as well.
        orcid=row["ORCID"],
        wikidata=wikidata,
        twitter=twitter,
        institutions=institutions,
    )