# Source code for driada.gdrive.download

import os
from os.path import join

from pydrive2.drive import GoogleDrive
import warnings
import wget
import gdown
import pandas as pd
from pathlib import Path
import requests

from .gdrive_utils import (
    parse_google_drive_file,
    id_from_link,
    client,
    folder_type,
    folders_url,
    MAX_NUMBER_FILES,
)
from ..utils.output import Capturing


def _name_is_relevant(file_name, name_part, prohibited_name_part, whitelist, extensions):
    """Return True if ``file_name`` passes the whitelist/name/extension/exclusion rules."""
    # Whitelisted names bypass every other rule.
    if file_name in whitelist:
        return True
    if name_part not in file_name:
        return False
    # An empty ``extensions`` collection disables the extension filter.
    if len(extensions) != 0 and Path(file_name).suffix not in extensions:
        return False
    # A falsy exclusion (None or "") means "exclude nothing". The explicit check
    # matters because "" is a substring of every string, so a bare ``in`` test
    # would reject every file when the default value is used.
    if prohibited_name_part and prohibited_name_part in file_name:
        return False
    return True


def retrieve_relevant_ids(
    folder,
    name_part,
    prohibited_name_part="",
    whitelist=None,
    extensions=None,
):
    """Collect (id, name) pairs of files in a Google Drive folder that match the filters.

    The folder page is fetched with ``client.get`` and parsed with
    ``parse_google_drive_file``. Every non-folder entry is tested against the
    filters; every folder entry is searched recursively with the same filters.

    Parameters
    ----------
    folder : str
        URL of the Google Drive folder to search.
    name_part : str
        Substring that must be present in the file name for it to be included.
    prohibited_name_part : str or None, optional
        Substring that, if present in the file name, excludes the file.
        ``""`` (the default) and ``None`` both mean "no exclusions".
    whitelist : list of str or None, optional
        Exact file names that are included regardless of the other criteria.
        ``None`` (the default) is treated as an empty list.
    extensions : list of str or None, optional
        Allowed file extensions (e.g. ``['.csv', '.xlsx']``). An empty list
        allows every extension. ``None`` (the default) means ``['.csv', '.xlsx']``.

    Returns
    -------
    return_code : bool
        True if the folder and all of its subfolders were read successfully,
        False otherwise.
    relevant : list of tuple or None
        ``(file_id, file_name)`` tuples for the matching files. ``None`` when
        the top-level folder page does not answer with HTTP status 200. When a
        subfolder fails, the matches gathered before the failure are returned.

    Raises
    ------
    MemoryError
        If a folder lists more than ``MAX_NUMBER_FILES`` entries.

    Notes
    -----
    Exceptions raised by ``client.get`` itself are not caught here and
    propagate to the caller. A subfolder whose page cannot be fetched is
    reported with a printed message and stops the search: the function then
    returns ``False`` together with the partial result."""

    if whitelist is None:
        whitelist = []
    if extensions is None:
        extensions = [".csv", ".xlsx"]

    return_code = True
    folder_page = client.get(folder)

    if folder_page.status_code != 200:
        return False, None

    _, id_name_type_iter = parse_google_drive_file(
        folder,
        folder_page.text,
    )

    relevant = []
    # Materialize the iterator so the entries can be counted before iterating.
    id_name_type_list = list(id_name_type_iter)
    if len(id_name_type_list) > MAX_NUMBER_FILES:
        raise MemoryError(
            f"The folder {folder} has {len(id_name_type_list)} elements while max allowed number of files is {MAX_NUMBER_FILES}"
        )

    for child_id, child_name, child_type in id_name_type_list:
        if child_type != folder_type:
            if _name_is_relevant(
                child_name, name_part, prohibited_name_part, whitelist, extensions
            ):
                relevant.append((child_id, child_name))
            continue

        # Folder entry: descend with exactly the same filters.
        return_code, rel_sublist = retrieve_relevant_ids(
            folders_url + child_id,
            name_part,
            prohibited_name_part=prohibited_name_part,
            whitelist=whitelist,
            extensions=extensions,
        )
        if not return_code:
            print(f"recursive search broke on folder {child_id}")
            break
        relevant.extend(rel_sublist)

    return return_code, relevant



# [docs]
def download_part_of_folder(
    output,  # path for downloaded data
    folder,  # share link to google drive folder
    key="",  # part of filename to search for
    antikey=None,  # part of name to suppress
    whitelist=None,  # filenames to be downloaded regardless of their names
    extensions=None,  # allowed file extensions
    via_pydrive=False,  # pydrive requires authorization, but can download a big number of files,
    gauth=None,
    maxfiles=None,
):
    """Download the files of a Google Drive folder that match the given filters.

    Two back ends are available: ``gdown`` (no authentication; the file list
    comes from ``retrieve_relevant_ids``) and PyDrive2 (needs an authenticated
    ``gauth`` object; the file list comes from ``drive.ListFile``).

    Parameters
    ----------
    output : str
        Local directory the files are written to. It is created if missing.
    folder : str
        Google Drive folder share link.
    key : str, optional
        Substring that must be present in a file name. Default is ``""``,
        which matches every name.
    antikey : str or None, optional
        Substring that excludes a file when present in its name. ``None``
        (the default) and ``""`` both mean "no exclusions".
    whitelist : list of str or None, optional
        Exact file names downloaded regardless of the other criteria.
        ``None`` (the default) is treated as an empty list.
    extensions : list of str or None, optional
        Allowed file extensions. An empty list allows every extension.
        ``None`` (the default) means ``['.csv', '.xlsx', '.npz']``.
    via_pydrive : bool, optional
        If True use PyDrive2, otherwise use gdown. Default is False.
    gauth : GoogleAuth object or None, optional
        PyDrive2 authentication object. Required if ``via_pydrive=True``.
    maxfiles : int or None, optional
        Maximum number of matching files to download, applied after
        filtering on both back ends. ``None`` (the default) means no limit;
        values below zero are treated as zero.

    Returns
    -------
    return_code : bool
        True if the download loop completed.
    rel : list of tuple
        ``(file_id, file_name)`` tuples of the files that were requested for
        download.
    load_log : list
        The ``Capturing`` object that collected the output of this call.

    Raises
    ------
    ValueError
        If ``via_pydrive=True`` but ``gauth`` is None.
    FileNotFoundError
        If ``retrieve_relevant_ids`` reports a failure (gdown path only).
    OSError
        If the output directory cannot be created.

    Notes
    -----
    The PyDrive2 query only lists entries whose parent is the given folder,
    whereas ``retrieve_relevant_ids`` is called for the gdown path; the name,
    extension, whitelist and exclusion rules are the same on both paths.

    Examples
    --------
    >>> # Download CSV files containing 'experiment' in name
    >>> success, files, log = download_part_of_folder(  # doctest: +SKIP
    ...     output='./data',
    ...     folder='https://drive.google.com/drive/folders/...',
    ...     key='experiment',
    ...     extensions=['.csv']
    ... )"""

    if whitelist is None:
        whitelist = []
    if extensions is None:
        extensions = [".csv", ".xlsx", ".npz"]
    # Normalized download cap shared by both back ends.
    limit = None if maxfiles is None else max(maxfiles, 0)

    os.makedirs(output, exist_ok=True)

    with Capturing() as load_log:
        if via_pydrive:
            if gauth is None:
                raise ValueError(
                    "To use pydrive, you need to authenticate using one of the functions"
                    " in driada.gdrive.auth"
                )
            drive = GoogleDrive(gauth)

            rel = []
            fid = id_from_link(folder)
            file_list = drive.ListFile({"q": f"'{fid}' in parents and trashed=false"}).GetList()

            for f in file_list:
                # The cap counts matching files, so non-matching entries at the
                # start of the listing cannot use up the quota.
                if limit is not None and len(rel) >= limit:
                    break

                file_name = f["title"]
                if file_name in whitelist:
                    # Whitelisted names bypass every other rule.
                    should_download = True
                else:
                    should_download = (
                        key in file_name
                        and (not extensions or Path(file_name).suffix in extensions)
                        # "" is contained in every string, so a falsy antikey
                        # must be treated as "no exclusion" explicitly.
                        and (not antikey or antikey not in file_name)
                    )

                if should_download:
                    f.GetContentFile(join(output, file_name))
                    rel.append((f["id"], file_name))

            return_code = True

        else:
            return_code, rel = retrieve_relevant_ids(
                folder,
                key,
                # Pass None rather than "" so that an empty antikey can never
                # be interpreted as a substring that matches every file name.
                prohibited_name_part=antikey or None,
                whitelist=whitelist,
                extensions=extensions,
            )

            if not return_code:
                raise FileNotFoundError("Error in downloading procedure!")

            if limit is not None:
                rel = rel[:limit]
            for file_id, name in rel:
                gdown.download(id=file_id, output=os.path.join(output, name))

        return return_code, rel, load_log




# [docs]
def download_gdrive_data(
    data_router,
    expname,
    whitelist=None,
    via_pydrive=False,
    data_pieces=None,
    tdir="DRIADA data",
    gauth=None,
):
    """Download the data of one experiment from Google Drive.

    ``data_router`` is either a table that maps experiments to Google Drive
    folder links (one column per data type) or a single share link. In both
    cases the actual download is delegated to ``download_part_of_folder`` with
    ``key=expname``.

    Parameters
    ----------
    data_router : pandas.DataFrame or str
        Either a DataFrame with an 'Experiment' (or 'Эксперимент') column and
        one column of Google Drive links per data type, or a string holding a
        Google Drive share link.
    expname : str
        Name of the experiment. In DataFrame mode it must occur in the
        experiment column; in both modes it is used as the output folder name
        and as the file-name filter.
    whitelist : list of str or None, optional
        File names that are always downloaded. ``None`` (the default) means
        ``['Timing.xlsx']``.
    via_pydrive : bool, optional
        If True, use PyDrive2 for downloading (requires ``gauth``).
        If False, use gdown. Default is False.
    data_pieces : list of str or None, optional
        Column names (data types) to download. If None, every column except
        the experiment column, the description column, 'Video',
        'Aligned data' and 'Computation results' is used.
        Ignored when ``data_router`` is a share link.
    tdir : str, optional
        Target directory for downloaded data. Default is 'DRIADA data'.
    gauth : GoogleAuth object or None, optional
        PyDrive2 authentication object. Required if ``via_pydrive=True``.

    Returns
    -------
    success : bool
        True if at least one file was reported as downloaded, False otherwise.
    load_log : list
        The ``Capturing`` object that collected the output of this call,
        extended with the logs returned by ``download_part_of_folder``.

    Raises
    ------
    ValueError
        If ``data_router`` is neither a DataFrame nor a string.
        If ``data_router`` is a DataFrame without an experiment column.
        If ``via_pydrive=True`` but ``gauth`` is None.
    KeyError
        If an entry of ``data_pieces`` is not a column of ``data_router``.

    Notes
    -----
    DataFrame mode writes to ``tdir/expname/<data type>/``; share-link mode
    writes to ``tdir/expname/``. A per-download directory that received no
    files is removed on a best-effort basis (a non-empty directory is kept).
    Cells that are not strings containing "http" (for example empty cells)
    are skipped.

    Examples
    --------
    >>> # Using DataFrame router
    >>> success, log = download_gdrive_data(  # doctest: +SKIP
    ...     data_router=router_df,
    ...     expname='exp001'
    ... )

    >>> # Using direct share link
    >>> success, log = download_gdrive_data(  # doctest: +SKIP
    ...     data_router='https://drive.google.com/drive/folders/...',
    ...     expname='exp001'
    ... )"""

    # Validate inputs before any output capturing or directory creation.
    if not isinstance(data_router, (str, pd.DataFrame)):
        raise ValueError(
            "data_router must be a pandas.DataFrame or a Google Drive share link (str), "
            f"got {type(data_router).__name__}"
        )
    if via_pydrive and gauth is None:
        raise ValueError("gauth is required when via_pydrive=True")
    if whitelist is None:
        whitelist = ["Timing.xlsx"]

    if isinstance(data_router, str):
        # Direct share link mode
        with Capturing() as load_log:
            print("-------------------------------------------------------------")
            print(f"Extracting data for {expname} from Google Drive share link")
            print("-------------------------------------------------------------")

            success = False
            output_dir = join(tdir, expname)
            os.makedirs(output_dir, exist_ok=True)

            return_code, rel, folder_log = download_part_of_folder(
                output_dir,
                data_router,  # share link
                key=expname,
                whitelist=whitelist,
                via_pydrive=via_pydrive,
                gauth=gauth,
            )

            load_log.extend(folder_log)

            if len(rel) == 0:
                try:
                    os.rmdir(output_dir)
                except OSError:
                    pass  # Directory not empty or other error
                print("No relevant data found at the provided link")
            else:
                loaded_names = [r[1] for r in rel]
                print(f"Downloaded {len(loaded_names)} files:")
                for n in loaded_names:
                    print(f"  - {n}")
                success = True

            return success, load_log

    # DataFrame mode
    with Capturing() as load_log:
        print("-------------------------------------------------------------")
        print(f"Extracting data for {expname} from Google Drive")
        print("-------------------------------------------------------------")

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)

            success = False
            # Support both English and Russian column names for backward compatibility
            exp_column = "Experiment" if "Experiment" in data_router.columns else "Эксперимент"
            desc_column = (
                "Description" if "Description" in data_router.columns else "Краткое описание"
            )

            if exp_column not in data_router.columns:
                raise ValueError(
                    "data_router must have either 'Experiment' or 'Эксперимент' column"
                )

            available_exp = data_router[exp_column].values
            if expname not in available_exp:
                print(f"{expname} not found in available experiments: {available_exp}")
                return success, load_log

            # Only the first row matching the experiment name is used.
            row = data_router[data_router[exp_column] == expname]
            links = dict(zip(row.columns, row.values[0]))

            os.makedirs(join(tdir, expname), exist_ok=True)
            if data_pieces is None:
                excluded = [
                    exp_column,
                    desc_column,
                    "Video",
                    "Aligned data",
                    "Computation results",
                ]
                data_pieces = [d for d in list(data_router.columns.values) if d not in excluded]

            for key in data_pieces:
                link = links[key]
                # Empty cells are typically NaN (a float); only strings can be
                # links, and the substring test would raise on anything else.
                if not (isinstance(link, str) and "http" in link):
                    continue

                print(f"Loading data: {key}...")
                ddir = join(tdir, expname, key)
                os.makedirs(ddir, exist_ok=True)
                return_code, rel, folder_log = download_part_of_folder(
                    ddir,
                    link,
                    key=expname,
                    whitelist=whitelist,
                    via_pydrive=via_pydrive,
                    gauth=gauth,
                )

                load_log.extend(folder_log)

                if len(rel) == 0:
                    # Best-effort cleanup, same as share-link mode: the folder
                    # may hold files from an earlier run and must then be kept.
                    try:
                        os.rmdir(ddir)
                    except OSError:
                        pass
                    print("No relevant data found at: ", link)

                else:
                    loaded_names = [r[1] for r in rel]
                    for n in loaded_names:
                        print(n)
                    success = True

                print("--------------------------")

            return success, load_log




# [docs]
def initialize_iabs_router(root="/content", router_source=None):
    """Load the IABS data router table and list its downloadable data types.

    The router can come from three sources: a pre-loaded DataFrame, a URL
    given by the caller, or the ``IABS_ROUTER_URL`` defined in ``config.py``.

    Parameters
    ----------
    root : str, optional
        Directory where a downloaded router file is saved. It is created if
        missing. Default is '/content'. Not used when ``router_source`` is a
        DataFrame.
    router_source : str, pandas.DataFrame, or None, optional
        Source of the router data:
        - pandas.DataFrame: used as is (a copy is processed).
        - str: a URL. If it ends with ``/export?format=xlsx`` it is read
          directly with ``pandas.read_excel``; otherwise it is downloaded
          with ``wget`` to ``root`` and then read.
        - anything else (default None): the URL is taken from
          ``IABS_ROUTER_URL`` in ``config.py``.

    Returns
    -------
    data_router : pandas.DataFrame
        The router table with empty strings replaced by missing values and
        missing values forward-filled.
    data_pieces : list of str
        All column names except the experiment column, the description
        column, 'Video', 'Aligned data' and 'Computation results'.

    Raises
    ------
    ImportError
        If ``config.py`` cannot be imported, or if it does not define
        ``IABS_ROUTER_URL``.
    requests.RequestException
        If downloading fails, or (for a caller-supplied URL) if the
        downloaded content cannot be read.
    pd.errors.ParserError
        If the file downloaded from the config URL cannot be read as Excel.
    OSError
        If ``root`` cannot be created or a stale router file cannot be removed.

    Notes
    -----
    WARNING: when a URL source is used, an existing 'IABS data router.xlsx'
    in ``root`` is deleted before downloading. No backup is created.

    Both English ('Experiment', 'Description') and Russian ('Эксперимент',
    'Краткое описание') metadata column names are recognized.

    Examples
    --------
    >>> # Using config file URL (default)
    >>> router, pieces = initialize_iabs_router()  # doctest: +SKIP

    >>> # Using direct Google Sheets URL
    >>> url = "https://docs.google.com/spreadsheets/d/.../export?format=xlsx"
    >>> router, pieces = initialize_iabs_router(router_source=url)  # doctest: +SKIP

    >>> # Using pre-loaded DataFrame
    >>> df = pd.read_excel("my_router.xlsx")  # doctest: +SKIP
    >>> router, pieces = initialize_iabs_router(router_source=df)  # doctest: +SKIP
    """

    router_name = "IABS data router.xlsx"

    if isinstance(router_source, pd.DataFrame):
        # Work on a copy so the caller's DataFrame is left untouched.
        data_router = router_source.copy()

    elif isinstance(router_source, str):
        router_path = join(root, router_name)
        os.makedirs(root, exist_ok=True)

        # Remove any stale copy so the download writes to exactly router_path.
        if os.path.exists(router_path):
            os.remove(router_path)

        try:
            if router_source.endswith("/export?format=xlsx"):
                # Direct Google Sheets export URL
                data_router = pd.read_excel(router_source)
            else:
                # Regular download URL
                wget.download(router_source, out=router_path)
                data_router = pd.read_excel(router_path)
        except Exception as e:
            raise requests.RequestException(
                f"Failed to download/parse router from URL: {e}"
            ) from e

    else:
        # Default behavior: download from config URL
        router_path = join(root, router_name)
        os.makedirs(root, exist_ok=True)

        if os.path.exists(router_path):
            os.remove(router_path)

        # ModuleNotFoundError is a subclass of ImportError, so it has to be
        # handled first. A config module that exists but lacks the name raises
        # a plain ImportError ("cannot import name"), never AttributeError.
        try:
            from .config import IABS_ROUTER_URL
        except ModuleNotFoundError as e:
            raise ImportError(
                "config.py not found. Please copy config_template.py to config.py "
                "and set IABS_ROUTER_URL to your Google Sheets export URL. "
                "Make sure to add config.py to .gitignore."
            ) from e
        except ImportError as e:
            raise ImportError(
                "IABS_ROUTER_URL not found in config.py. Please check config_template.py "
                "for the required format."
            ) from e

        # Download router file
        try:
            wget.download(IABS_ROUTER_URL, out=router_path)
        except Exception as e:
            raise requests.RequestException(f"Failed to download router file: {e}") from e

        try:
            data_router = pd.read_excel(router_path)
        except Exception as e:
            raise pd.errors.ParserError(f"Failed to parse router Excel file: {e}") from e

    # Process the DataFrame (applies to all sources): blank cells become
    # missing values and are then filled from the row above.
    data_router = data_router.replace("", None).ffill()

    # Support both English and Russian column names for backward compatibility
    exp_column = "Experiment" if "Experiment" in data_router.columns else "Эксперимент"
    desc_column = "Description" if "Description" in data_router.columns else "Краткое описание"

    metadata_columns = [
        exp_column,
        desc_column,
        "Video",
        "Aligned data",
        "Computation results",
    ]
    data_pieces = [d for d in list(data_router.columns.values) if d not in metadata_columns]
    return data_router, data_pieces