# Source code for driada.gdrive.gdrive_utils

import requests
import regex
from bs4 import BeautifulSoup
from itertools import islice
import json



class GoogleDriveFile(object):
    """Represent one node (file or folder) of a Google Drive folder tree.

    Attributes
    ----------
    id : str
        Unique id, used to build the download URL.
    name : str
        Actual name, used as file name.
    type : str
        MIME type, or application/vnd.google-apps.folder if it is a folder.
    children : List[GoogleDriveFile]
        If it is a directory, it contains the folder files/directories.
    """

    def __init__(self, id, name, type, children=None):
        """Initialize GoogleDriveFile instance.

        Parameters
        ----------
        id : str
            Unique file or folder ID from Google Drive.
        name : str
            Display name of the file or folder.
        type : str
            MIME type of the file, or 'application/vnd.google-apps.folder'
            for folders.
        children : List[GoogleDriveFile], optional
            Child items if this is a folder. When omitted (or None) a new
            empty list is created for this instance.
        """
        self.id = id
        self.name = name
        self.type = type
        # A fresh list per instance: a literal `[]` default in the signature
        # would be shared by every instance created without `children`.
        self.children = [] if children is None else children

    def is_folder(self):
        """Check if the GoogleDriveFile is a folder.

        Returns
        -------
        bool
            True if ``type`` equals the module-level ``folder_type``
            constant, False otherwise.
        """
        return self.type == folder_type

    def __repr__(self):
        """Return string representation of GoogleDriveFile.

        Returns
        -------
        str
            ``GoogleDriveFile(id=..., name=..., type=..., children=[...])``.

        Notes
        -----
        Children are rendered recursively, so the output can be long for
        large folder trees.
        """
        template = "GoogleDriveFile(id={id}, name={name}, type={type}, children={children})"
        return template.format(
            id=self.id,
            name=self.name,
            type=self.type,
            children=self.children,
        )





def parse_google_drive_file(folder, content, use_cookies=True):
    """Extract information about the current page file and its children.

    Parameters
    ----------
    folder : str
        URL of the Google Drive folder, e.g.
        'https://drive.google.com/drive/folders/{id}'. A trailing slash or a
        query string (such as '?usp=sharing') is ignored when the id is
        extracted.
    content : str
        Google Drive's raw HTML content.
    use_cookies : bool, optional
        When False, the cookies of the shared module-level ``client``
        session are cleared. Default is True (cookies are kept).

    Returns
    -------
    gdrive_file : GoogleDriveFile
        Current GoogleDriveFile object with empty children list.
    id_name_type_iter : list
        List of tuples (id, name, type) for each child item.

    Raises
    ------
    RuntimeError
        If folder information cannot be extracted from HTML.

    Notes
    -----
    Parses JavaScript data embedded in Google Drive HTML.
    Expects specific HTML structure and may break with Google Drive updates.
    """
    folder_soup = BeautifulSoup(content, features="html.parser")

    if not use_cookies:
        client.cookies.clear()

    # Find the <script> tag that assigns window['_DRIVE_ivd'].
    encoded_data = None
    for script in folder_soup.select("script"):
        inner_html = script.decode_contents()
        if "_DRIVE_ivd" not in inner_html:
            continue

        # The first JS string literal is '_DRIVE_ivd' itself; the second one
        # is the encoded array, hence the islice(..., 1, None).
        regex_iter = string_regex.finditer(inner_html)
        try:
            encoded_data = next(islice(regex_iter, 1, None)).group(1)
        except StopIteration:
            # `from None`: the StopIteration is an implementation detail and
            # only adds noise to the traceback.
            raise RuntimeError("Couldn't find the folder encoded JS string") from None
        break

    if encoded_data is None:
        raise RuntimeError(
            "Cannot retrieve the folder information from the link. "
            "You may need to change the permission to "
            "'Anyone with the link'."
        )

    # Undo the JS string escaping, then parse the payload as JSON.
    decoded = encoded_data.encode("utf-8").decode("unicode_escape")
    folder_arr = json.loads(decoded)

    folder_contents = [] if folder_arr[0] is None else folder_arr[0]

    # Drop any query string / trailing slash so that URLs such as
    # '.../folders/<id>?usp=sharing' or '.../folders/<id>/' still give '<id>'.
    folder_id = folder.split("?")[0].rstrip("/").split("/")[-1]

    gdrive_file = GoogleDriveFile(
        id=folder_id,
        # The page title has the form '<name> - <suffix>'; keep everything
        # before the last ' - ' separator.
        name=" - ".join(folder_soup.title.contents[0].split(" - ")[:-1]),
        type=folder_type,
    )

    # Each entry carries the id at index 0, the name at index 2 and the MIME
    # type at index 3. The name is re-encoded because 'unicode_escape' above
    # decodes the raw bytes as Latin-1, which garbles non-ASCII names.
    id_name_type_iter = [
        (e[0], e[2].encode("raw_unicode_escape").decode("utf-8"), e[3]) for e in folder_contents
    ]

    return gdrive_file, id_name_type_iter




def download_and_parse_google_drive_link(
    folder, quiet=False, use_cookies=True, remaining_ok=False, name_part=""
):
    """Get folder structure of Google Drive folder URL.

    Parameters
    ----------
    folder : str
        URL of the Google Drive folder.
        Must be of the format 'https://drive.google.com/drive/folders/{id}'.
    quiet : bool, optional
        Suppress terminal output. Default is False.
    use_cookies : bool, optional
        Flag to use cookies; forwarded to ``parse_google_drive_file`` and to
        the recursive calls. Default is True.
    remaining_ok : bool, optional
        Allow processing if a folder lists ``MAX_NUMBER_FILES`` or more
        items. Applies to sub-folders as well. Default is False.
    name_part : str, optional
        Filter items of this folder by name substring. Default is empty
        string (no filter). The filter is applied to the direct children of
        ``folder`` only; sub-folders that match are retrieved in full.

    Returns
    -------
    return_code : bool
        True if successful, False if this folder page or any sub-folder page
        answered with a non-200 status code.
    gdrive_file : GoogleDriveFile or None
        Folder structure with nested children, or None if failed.

    Raises
    ------
    RuntimeError
        If a folder lists ``MAX_NUMBER_FILES`` or more items and
        ``remaining_ok`` is False, or if the page cannot be parsed.

    Notes
    -----
    Recursively processes subfolders. Exceptions raised by the HTTP client
    itself are not caught here.
    """
    folder_page = client.get(folder)
    if folder_page.status_code != 200:
        return False, None

    gdrive_file, id_name_type_iter = parse_google_drive_file(
        folder,
        folder_page.text,
        # Previously dropped, which made use_cookies=False a no-op.
        use_cookies=use_cookies,
    )

    for child_id, child_name, child_type in id_name_type_iter:
        if name_part not in child_name:
            continue

        if child_type != folder_type:
            if not quiet:
                print(
                    "Processing file",
                    child_id,
                    child_name,
                )
            gdrive_file.children.append(
                GoogleDriveFile(
                    id=child_id,
                    name=child_name,
                    type=child_type,
                )
            )
            continue

        if not quiet:
            print(
                "Retrieving folder",
                child_id,
                child_name,
            )
        return_code, child = download_and_parse_google_drive_link(
            folders_url + child_id,
            use_cookies=use_cookies,
            quiet=quiet,
            # Propagate the caller's choice so that a full sub-folder does
            # not raise when the caller explicitly accepted truncation.
            remaining_ok=remaining_ok,
        )
        if not return_code:
            return False, None
        gdrive_file.children.append(child)

    # Count the items the page listed, not the children kept after the
    # name_part filter: a filtered view must not hide a truncated listing.
    has_at_least_max_files = len(id_name_type_iter) >= MAX_NUMBER_FILES
    if not remaining_ok and has_at_least_max_files:
        err_msg = " ".join(
            [
                "The gdrive folder with url: {url}".format(url=folder),
                "has at least {max} files,".format(max=MAX_NUMBER_FILES),
                "gdrive can't download more than this limit,",
                "if you are ok with this,",
                "please run again with --remaining-ok flag.",
            ]
        )
        raise RuntimeError(err_msg)
    return True, gdrive_file




def id_from_link(link):
    """Extract the file or folder ID from a Google Drive URL.

    Parameters
    ----------
    link : str
        Google Drive URL containing the file or folder ID.
        Can be in format:
        - https://drive.google.com/drive/folders/{id}
        - https://drive.google.com/file/d/{id}/view
        - https://drive.google.com/open?id={id}
        Extra query parameters (``usp=sharing``, ``ouid=...``) and a trailing
        slash are ignored.

    Returns
    -------
    str
        The extracted file or folder ID.

    Raises
    ------
    ValueError
        If the link doesn't contain 'http'.

    Examples
    --------
    >>> id_from_link('https://drive.google.com/drive/folders/1a2b3c4d5e')
    '1a2b3c4d5e'
    >>> id_from_link('https://drive.google.com/open?id=xyz123')
    'xyz123'
    >>> id_from_link('https://drive.google.com/file/d/abc123/view')
    'abc123'

    Notes
    -----
    Does not validate the extracted ID format. May return empty string
    or invalid IDs for malformed URLs.
    """
    from urllib.parse import parse_qs, urlsplit

    if "http" not in link:
        raise ValueError("Wrong link format")

    parts = urlsplit(link)

    # Look at the real `id` query parameter only. A plain substring test for
    # "id=" also matches e.g. "ouid=", which Google adds to share links.
    query_ids = parse_qs(parts.query).get("id")
    if query_ids:
        return query_ids[0]

    # Path forms: '.../folders/<id>' and '.../d/<id>/...'. Empty segments are
    # dropped so that a trailing slash does not end up in the result.
    segments = [segment for segment in parts.path.split("/") if segment]
    for marker in ("folders", "d"):
        if marker in segments[:-1]:
            return segments[segments.index(marker) + 1]

    # Unrecognised shape: keep the historical best-effort result.
    return link.split("folders/")[-1].split("?")[0]



# MIME type Google Drive reports for folders; compared against item types to
# tell folders from regular files.
folder_type = "application/vnd.google-apps.folder"

# URL prefixes; an item id is appended to build the final URL.
files_url = "https://drive.google.com/uc?id="
folders_url = "https://drive.google.com/drive/folders/"

# Number of listed items at which a folder listing is treated as possibly
# truncated.
MAX_NUMBER_FILES = 50

# Matches a single-quoted JS string literal (backslash escapes allowed) and
# captures its contents in group 1.
string_regex = regex.compile(r"'((?:[^'\\]|\\.)*)'")

# One HTTP session shared by every function in this module, so cookies
# persist between requests.
client = requests.session()