cirro.file_utils
import os
import random
import time
from pathlib import Path, PurePath
from typing import List, Union, Dict

from boto3.exceptions import S3UploadFailedError
from botocore.exceptions import ConnectionError

from cirro.clients import S3Client
from cirro.models.file import DirectoryStatistics, File, PathLike

if os.name == 'nt':
    import win32api
    import win32con


def filter_files_by_pattern(files: Union[List[File], List[str]], pattern: str) -> Union[List[File], List[str]]:
    """
    Filters a list of files by a glob pattern

    Args:
        files (Union[List[File], List[str]]): List of Files or file paths
        pattern (str): Glob pattern (e.g., *.fastq)

    Returns:
        The filtered list of files
    """
    def matches_glob(file: Union[File, str]):
        # Strings are matched directly; File objects are matched on their relative path
        return PurePath(file if isinstance(file, str) else file.relative_path).match(pattern)

    return [
        file for file in files
        if matches_glob(file)
    ]


def generate_flattened_file_map(files: List[PathLike]) -> Dict[PathLike, str]:
    """
    Generates a mapping of file paths "flattened" to their base name.

    Example: data1/sample1.fastq.gz -> sample1.fastq.gz

    Args:
        files (List[PathLike]): List of file paths

    Returns:
        Dict[PathLike, str]: Mapping of file paths to their base name
    """
    return {
        file: Path(file).name for file in files
    }


def _is_hidden_file(file_path: Path) -> bool:
    """
    Reports whether a file should be treated as hidden.

    On Windows the hidden/system file attributes are checked;
    elsewhere a leading dot in the file name marks the file as hidden.
    """
    # Remove hidden files from listing, desktop.ini .DS_Store, etc.
    if os.name == 'nt':
        attributes = win32api.GetFileAttributes(str(file_path))
        # Normalise the attribute bitmask to a real boolean
        return bool(attributes & (win32con.FILE_ATTRIBUTE_HIDDEN | win32con.FILE_ATTRIBUTE_SYSTEM))
    else:
        return file_path.name.startswith('.')


def get_files_in_directory(
    directory: Union[str, Path],
    include_hidden=False
) -> List[str]:
    """
    Returns a list of strings containing the relative path of
    each file within the indicated directory.

    Args:
        directory (Union[str, Path]): The path to the directory
        include_hidden (bool): include hidden files in the returned list

    Returns:
        Sorted list of POSIX-style file paths, relative to the directory
    """
    path = Path(directory).expanduser()

    paths = []

    for file_path in path.rglob("*"):
        if file_path.is_dir():
            continue

        if not include_hidden and _is_hidden_file(file_path):
            continue

        # Skip entries that do not resolve to anything (e.g. broken symlinks)
        if not file_path.exists():
            continue

        # Strip only the leading directory. A plain string replace would also
        # remove matching text further along the path (a/data/x -> datx).
        paths.append(file_path.relative_to(path).as_posix())

    paths.sort()
    return paths


def _bytes_to_human_readable(num_bytes: int) -> str:
    """
    Formats a byte count using decimal (1000-based) units, capped at PB.
    """
    for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
        if num_bytes < 1000.0 or unit == 'PB':
            break
        num_bytes /= 1000.0
    return f"{num_bytes:,.2f} {unit}"


def get_files_stats(files: List[PathLike]) -> DirectoryStatistics:
    """
    Returns information about the list of files provided, such as the total size and number of files.

    Args:
        files (List[PathLike]): Paths (str or Path) of the files to measure

    Returns:
        DirectoryStatistics: total size in bytes, its human-readable form, and the file count
    """
    # Path() accepts both str and Path, so either form can be stat-ed
    sizes = [Path(f).stat().st_size for f in files]
    total_size = sum(sizes)
    return DirectoryStatistics(
        size_friendly=_bytes_to_human_readable(total_size),
        size=total_size,
        number_of_files=len(sizes)
    )


def upload_directory(directory: PathLike,
                     files: List[PathLike],
                     file_path_map: Dict[PathLike, str],
                     s3_client: S3Client,
                     bucket: str,
                     prefix: str,
                     max_retries=10):
    """
    @private

    Uploads a list of files from the specified directory
    Args:
        directory (str|Path): Path to directory
        files (typing.List[str|Path]): List of paths to files within the directory
            must be the same type as directory.
        file_path_map (typing.Dict[str|Path, str]): Map of file paths from source to destination
        s3_client (cirro.clients.S3Client): S3 client
        bucket (str): S3 bucket
        prefix (str): S3 prefix
        max_retries (int): Maximum number of upload attempts per file

    Raises:
        ValueError: If the files are not all of the same type as the directory
        S3UploadFailedError, ConnectionError: If a file still fails to upload
            on the final attempt
    """
    # Ensure all files are of the same type as the directory
    if not all(isinstance(file, type(directory)) for file in files):
        raise ValueError("All files must be of the same type as the directory (str or Path)")

    for file in files:
        if isinstance(file, str):
            file_path = Path(directory, file)
        else:
            file_path = file

        # Check if is present in the file_path_map
        # if it is, use the mapped value as the destination path
        if file in file_path_map:
            file_relative = file_path_map[file]
        else:
            file_relative = file_path.relative_to(directory).as_posix()

        key = f'{prefix}/{file_relative}'

        # Retry up to max_retries times
        for retry in range(max_retries):

            # Try the upload
            try:
                s3_client.upload_file(
                    file_path=file_path,
                    bucket=bucket,
                    key=key
                )

            # Catch the upload error
            except (S3UploadFailedError, ConnectionError) as e:
                attempts_remaining = max_retries - (retry + 1)

                # Out of attempts: surface the failure instead of silently
                # moving on to the next file (and skip the pointless sleep)
                if attempts_remaining == 0:
                    raise

                # Randomised delay that grows with each failed attempt
                delay = random.uniform(0, 60) + retry * 60
                # Report the error
                print(f"Encountered error:\n{str(e)}\n"
                      f"Retrying in {delay:.0f} seconds ({attempts_remaining} attempts remaining)")
                time.sleep(delay)

            else:
                # Upload succeeded, move on to the next file
                break


def download_directory(directory: str, files: List[str], s3_client: S3Client, bucket: str, prefix: str):
    """
    @private

    Downloads a list of files from under an S3 prefix into a local directory,
    creating any missing parent folders.
    """
    for file in files:
        # Avoid a leading slash in the key when the prefix is empty
        key = f'{prefix}/{file}'.lstrip('/')
        local_path = Path(directory, file)
        local_path.parent.mkdir(parents=True, exist_ok=True)

        s3_client.download_file(local_path=local_path,
                                bucket=bucket,
                                key=key)
def
filter_files_by_pattern( files: Union[List[cirro.models.file.File], List[str]], pattern: str) -> Union[List[cirro.models.file.File], List[str]]:
def filter_files_by_pattern(files: Union[List[File], List[str]], pattern: str) -> Union[List[File], List[str]]:
    """
    Selects the entries of a file list whose path matches a glob pattern

    Args:
        files (Union[List[File], List[str]]): List of Files or file paths
        pattern (str): Glob pattern (e.g., *.fastq)

    Returns:
        The entries that matched, in their original order
    """
    matched = []
    for entry in files:
        # Plain strings are used as-is; File objects are matched on their relative path
        candidate = entry if isinstance(entry, str) else entry.relative_path
        if PurePath(candidate).match(pattern):
            matched.append(entry)
    return matched
Filters a list of files by a glob pattern
Arguments:
- files (Union[List[File], List[str]]): List of Files or file paths
- pattern (str): Glob pattern (e.g., *.fastq)
Returns:
The filtered list of files
def
generate_flattened_file_map(files: List[~PathLike]) -> Dict[~PathLike, str]:
def generate_flattened_file_map(files: List[PathLike]) -> Dict[PathLike, str]:
    """
    Builds a lookup from each file path to its base name, dropping any parent folders.

    Example: data1/sample1.fastq.gz -> sample1.fastq.gz

    Paths that share a base name map to the same value.

    Args:
        files (List[PathLike]): List of file paths

    Returns:
        Dict[PathLike, str]: Mapping of file paths to their base name
    """
    flattened = {}
    for source_path in files:
        flattened[source_path] = Path(source_path).name
    return flattened
Generates a mapping of file paths "flattened" to their base name.
Example: data1/sample1.fastq.gz -> sample1.fastq.gz
Arguments:
- files (List[PathLike]): List of file paths
Returns:
Dict[PathLike, str]: Mapping of file paths to their base name
def
get_files_in_directory(directory: Union[str, pathlib.Path], include_hidden=False) -> List[str]:
def get_files_in_directory(
    directory: Union[str, Path],
    include_hidden=False
) -> List[str]:
    """
    Returns a list of strings containing the relative path of
    each file within the indicated directory.

    Args:
        directory (Union[str, Path]): The path to the directory
        include_hidden (bool): include hidden files in the returned list

    Returns:
        Sorted list of POSIX-style file paths, relative to the directory
    """
    path = Path(directory).expanduser()

    paths = []

    for file_path in path.rglob("*"):
        if file_path.is_dir():
            continue

        if not include_hidden and _is_hidden_file(file_path):
            continue

        # Skip entries that do not resolve to anything (e.g. broken symlinks)
        if not file_path.exists():
            continue

        # Strip only the leading directory. A plain string replace would also
        # remove matching text further along the path (a/data/x -> datx).
        paths.append(file_path.relative_to(path).as_posix())

    paths.sort()
    return paths
Returns a list of strings containing the relative path of each file within the indicated directory.
Arguments:
- directory (Union[str, Path]): The path to the directory
- include_hidden (bool): include hidden files in the returned list
Returns:
List of files in the directory
def get_files_stats(files: List[PathLike]) -> DirectoryStatistics:
    """
    Returns information about the list of files provided, such as the total size and number of files.

    Args:
        files (List[PathLike]): Paths (str or Path) of the files to measure

    Returns:
        DirectoryStatistics: total size in bytes, its human-readable form, and the file count
    """
    # Path() accepts both str and Path, so either form can be stat-ed
    sizes = [Path(f).stat().st_size for f in files]
    total_size = sum(sizes)
    return DirectoryStatistics(
        size_friendly=_bytes_to_human_readable(total_size),
        size=total_size,
        number_of_files=len(sizes)
    )
Returns information about the list of files provided, such as the total size and number of files.