Source code for driada.gdrive.gdrive_utils

import json
import warnings
from itertools import islice

import regex
import requests
from bs4 import BeautifulSoup


class GoogleDriveFile:
    """One entry of a Google Drive listing: either a file or a folder.

    Attributes
    ----------
    id : str
        Unique id, used to build the download URL.
    name : str
        Actual name, used as file name.
    type : str
        MIME type, or ``application/vnd.google-apps.folder`` for a folder.
    children : List[GoogleDriveFile]
        Files/directories contained in this entry when it is a folder.
    """

    def __init__(self, id, name, type, children=None):
        """Store the entry's attributes.

        Parameters
        ----------
        id : str
            Unique file or folder ID from Google Drive.
        name : str
            Display name of the file or folder.
        type : str
            MIME type of the file, or 'application/vnd.google-apps.folder'
            for folders.
        children : List[GoogleDriveFile], optional
            Child items if this is a folder. When omitted (or None), a new
            empty list is created for this instance.
        """
        self.id = id
        self.name = name
        self.type = type
        # Build the list here rather than in the signature so instances
        # never share one mutable default.
        self.children = [] if children is None else children

    def is_folder(self):
        """Tell whether this entry is a folder.

        Returns
        -------
        bool
            True when ``type`` equals the module-level ``folder_type``
            constant, False otherwise.
        """
        return self.type == folder_type

    def __repr__(self):
        """Return a debug string listing every attribute.

        Returns
        -------
        str
            ``GoogleDriveFile(id=..., name=..., type=..., children=...)``.

        Notes
        -----
        Children are rendered recursively, so the output can get long for
        large folders.
        """
        return (
            f"GoogleDriveFile(id={self.id}, name={self.name}, "
            f"type={self.type}, children={self.children})"
        )
def parse_google_drive_file(folder, content, use_cookies=True):
    """Extract information about the current folder page and its children.

    Parameters
    ----------
    folder : str
        URL of the Google Drive folder, expected in the form
        'https://drive.google.com/drive/folders/{id}'. A trailing slash,
        query string (e.g. '?usp=sharing') or fragment is ignored when the
        folder id is extracted.
    content : str
        Google Drive's raw HTML content for that folder page.
    use_cookies : bool, optional
        When False, the cookies of the module-level ``client`` session are
        cleared before parsing. Default is True (cookies are left as-is).

    Returns
    -------
    gdrive_file : GoogleDriveFile
        GoogleDriveFile for the folder itself, with an empty children list.
    id_name_type_iter : list
        List of tuples (id, name, type), one per child item.

    Raises
    ------
    RuntimeError
        If the embedded folder data or the page title cannot be found in
        the HTML.

    Notes
    -----
    Parses JavaScript data embedded in the Google Drive HTML. It relies on
    the page's current structure and may break with Google Drive updates.
    """
    folder_soup = BeautifulSoup(content, features="html.parser")

    if not use_cookies:
        client.cookies.clear()

    # Locate the script tag that assigns window['_DRIVE_ivd'].
    encoded_data = None
    for script in folder_soup.select("script"):
        inner_html = script.decode_contents()
        if "_DRIVE_ivd" not in inner_html:
            continue
        # The first JS string literal is the '_DRIVE_ivd' key itself; the
        # second one holds the encoded array, so skip exactly one match.
        payload_match = next(
            islice(string_regex.finditer(inner_html), 1, None), None
        )
        if payload_match is None:
            raise RuntimeError("Couldn't find the folder encoded JS string")
        encoded_data = payload_match.group(1)
        break

    if encoded_data is None:
        raise RuntimeError(
            "Cannot retrieve the folder information from the link. "
            "You may need to change the permission to "
            "'Anyone with the link'."
        )

    # Resolve the JS escapes, then parse the result as JSON. The payload may
    # contain escapes Python does not recognise (such as \/), for which the
    # codec emits a DeprecationWarning; silence it locally so callers running
    # with warnings-as-errors are not broken.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        decoded = encoded_data.encode("utf-8").decode("unicode_escape")
    folder_arr = json.loads(decoded)

    folder_contents = [] if folder_arr[0] is None else folder_arr[0]

    title_tag = folder_soup.title
    if title_tag is None or not title_tag.contents:
        raise RuntimeError("Couldn't find the folder title in the page")
    # The title reads '<folder name> - <suffix>': keep everything before the
    # last ' - ' (empty string when the separator is absent).
    folder_name = title_tag.contents[0].rpartition(" - ")[0]

    # Drop any query string, fragment or trailing slash before taking the
    # last path segment, so '.../folders/ID?usp=sharing' still yields 'ID'.
    folder_path = folder.split("?", 1)[0].split("#", 1)[0].rstrip("/")
    folder_id = folder_path.split("/")[-1]

    gdrive_file = GoogleDriveFile(
        id=folder_id,
        name=folder_name,
        type=folder_type,
    )

    # 'unicode_escape' treated the UTF-8 bytes as Latin-1, so non-ASCII names
    # come out garbled; map them back to bytes and decode as UTF-8.
    id_name_type_iter = [
        (e[0], e[2].encode("raw_unicode_escape").decode("utf-8"), e[3])
        for e in folder_contents
    ]

    return gdrive_file, id_name_type_iter
# URL prefix of a Drive folder page; a folder id goes after the last slash.
folders_url = "https://drive.google.com/drive/folders/"

# URL prefix for a single file; the file id is appended after 'id='.
files_url = "https://drive.google.com/uc?id="

# MIME type that marks an entry as a folder (see GoogleDriveFile.is_folder).
folder_type = "application/vnd.google-apps.folder"

# Matches a single-quoted string literal, allowing backslash escapes inside;
# group 1 is the text between the quotes.
string_regex = regex.compile(r"'((?:[^'\\]|\\.)*)'")

# NOTE(review): presumably the maximum number of items handled per folder;
# it is not referenced in this part of the file, so confirm against callers.
MAX_NUMBER_FILES = 50

# Module-wide HTTP session; parse_google_drive_file clears its cookies when
# called with use_cookies=False.
client = requests.session()