cirro.file_utils
import os
import random
import time
from pathlib import Path, PurePath
from typing import List, Union

from boto3.exceptions import S3UploadFailedError
from botocore.exceptions import ConnectionError

from cirro.clients import S3Client
from cirro.models.file import DirectoryStatistics, File

if os.name == 'nt':
    import win32api
    import win32con


def filter_files_by_pattern(files: Union[List[File], List[str]], pattern: str) -> Union[List[File], List[str]]:
    """
    Filters a list of files by a glob pattern

    Args:
        files (Union[List[File], List[str]]): List of Files or file paths
        pattern (str): Glob pattern (e.g., *.fastq)

    Returns:
        The filtered list of files
    """
    def matches_glob(file: Union[File, str]):
        # Strings are used as the path directly; File objects contribute `relative_path`
        return PurePath(file if isinstance(file, str) else file.relative_path).match(pattern)

    return [
        file for file in files
        if matches_glob(file)
    ]


def _is_hidden_file(file_path: Path) -> bool:
    # Remove hidden files from listing, desktop.ini .DS_Store, etc.
    if os.name == 'nt':
        attributes = win32api.GetFileAttributes(str(file_path))
        # Normalise the attribute bitmask to a real boolean
        return bool(attributes & (win32con.FILE_ATTRIBUTE_HIDDEN | win32con.FILE_ATTRIBUTE_SYSTEM))
    else:
        # Only the file's own name is inspected, not its parent directories
        return file_path.name.startswith('.')


def get_files_in_directory(
    directory: Union[str, Path],
    include_hidden=False
) -> List[str]:
    """
    Returns a list of strings containing the relative path of
    each file within the indicated directory.

    Args:
        directory (Union[str, Path]): The path to the directory
        include_hidden (bool): include hidden files in the returned list

    Returns:
        Sorted list of POSIX-style file paths, relative to the directory
    """
    path = Path(directory).expanduser()

    paths = []

    for file_path in path.rglob("*"):
        if file_path.is_dir():
            continue

        if not include_hidden and _is_hidden_file(file_path):
            continue

        # Skip entries that do not resolve to an existing file (e.g. a broken symlink)
        if not file_path.exists():
            continue

        # relative_to() strips only the leading root. A textual replace would also
        # remove later occurrences of "<root>/" inside the path and corrupt it.
        paths.append(file_path.relative_to(path).as_posix())

    paths.sort()
    return paths


def get_files_stats(files: List[Path]) -> DirectoryStatistics:
    """
    @private

    Summarises the given files: total size (bytes divided by 2**30, labelled GB)
    and the number of files.
    """
    sizes = [f.stat().st_size for f in files]
    total_size = sum(sizes) / float(1 << 30)
    return {
        'sizeFriendly': f'{total_size:,.3f} GB',
        'size': total_size,
        'numberOfFiles': len(sizes)
    }


def upload_directory(directory: str, files: List[str], s3_client: S3Client, bucket: str, prefix: str, max_retries=10):
    """
    @private

    Uploads each file (relative to `directory`) to `bucket` under `prefix`,
    retrying failed uploads with an increasing, jittered delay.

    Raises:
        S3UploadFailedError, ConnectionError: when a file still fails on the
            final attempt, so a failed upload is never silently skipped.
    """
    for file in files:
        key = f'{prefix}/{file}'
        local_path = Path(directory, file)

        # Retry up to max_retries times
        for retry in range(max_retries):
            try:
                s3_client.upload_file(
                    local_path=local_path,
                    bucket=bucket,
                    key=key
                )
            except (S3UploadFailedError, ConnectionError) as e:
                attempts_remaining = max_retries - (retry + 1)
                # Out of attempts: surface the failure instead of sleeping
                # and then moving on to the next file as if it had succeeded
                if attempts_remaining == 0:
                    raise

                # Back off: random jitter plus one extra minute per failed attempt
                delay = random.uniform(0, 60) + retry * 60
                print(f"Encountered error:\n{str(e)}\n"
                      f"Retrying in {delay:.0f} seconds ({attempts_remaining} attempts remaining)")
                time.sleep(delay)
            else:
                # Upload succeeded, move on to the next file
                break


def download_directory(directory: str, files: List[str], s3_client: S3Client, bucket: str, prefix: str):
    """
    @private

    Downloads each file from `bucket` under `prefix` into `directory`,
    creating local parent folders as needed.
    """
    for file in files:
        # An empty prefix would otherwise yield a key with a leading slash
        key = f'{prefix}/{file}'.lstrip('/')
        local_path = Path(directory, file)
        local_path.parent.mkdir(parents=True, exist_ok=True)

        s3_client.download_file(local_path=local_path,
                                bucket=bucket,
                                key=key)
def
filter_files_by_pattern( files: Union[List[cirro.models.file.File], List[str]], pattern: str) -> Union[List[cirro.models.file.File], List[str]]:
def filter_files_by_pattern(files: Union[List[File], List[str]], pattern: str) -> Union[List[File], List[str]]:
    """
    Keeps only the entries whose path matches a glob pattern

    Args:
        files (Union[List[File], List[str]]): File objects or plain path strings
        pattern (str): Glob pattern (e.g., *.fastq)

    Returns:
        The matching entries, in their original order
    """
    selected = []
    for entry in files:
        # A plain string is the path itself; any other entry supplies `relative_path`
        candidate = entry if isinstance(entry, str) else entry.relative_path
        if PurePath(candidate).match(pattern):
            selected.append(entry)
    return selected
Filters a list of files by a glob pattern
Arguments:
- files (Union[List[File], List[str]]): List of Files or file paths
- pattern (str): Glob pattern (e.g., *.fastq)
Returns:
The filtered list of files
def
get_files_in_directory(directory: Union[str, pathlib.Path], include_hidden=False) -> List[str]:
def get_files_in_directory(
    directory: Union[str, Path],
    include_hidden=False
) -> List[str]:
    """
    Returns a list of strings containing the relative path of
    each file within the indicated directory.

    Args:
        directory (Union[str, Path]): The path to the directory
        include_hidden (bool): include hidden files in the returned list

    Returns:
        Sorted list of POSIX-style file paths, relative to the directory
    """
    root = Path(directory).expanduser()

    paths = []

    for file_path in root.rglob("*"):
        if file_path.is_dir():
            continue

        if not include_hidden and _is_hidden_file(file_path):
            continue

        # Skip entries that do not resolve to an existing file (e.g. a broken symlink)
        if not file_path.exists():
            continue

        # relative_to() strips only the leading root. A textual replace would also
        # remove later occurrences of "<root>/" inside the path and corrupt it.
        paths.append(file_path.relative_to(root).as_posix())

    paths.sort()
    return paths
Returns a list of strings containing the relative path of each file within the indicated directory.
Arguments:
- directory (Union[str, Path]): The path to the directory
- include_hidden (bool): include hidden files in the returned list
Returns:
List of files in the directory