cirro.file_utils

  1import os
  2import random
  3import time
  4from pathlib import Path, PurePath
  5from typing import List, Union, Dict
  6
  7from boto3.exceptions import S3UploadFailedError
  8from botocore.exceptions import ConnectionError
  9
 10from cirro.clients import S3Client
 11from cirro.models.file import DirectoryStatistics, File, PathLike
 12
 13if os.name == 'nt':
 14    import win32api
 15    import win32con
 16
 17
 18def filter_files_by_pattern(files: Union[List[File], List[str]], pattern: str) -> Union[List[File], List[str]]:
 19    """
 20    Filters a list of files by a glob pattern
 21
 22    Args:
 23        files (Union[List[File], List[str]]): List of Files or file paths
 24        pattern (str): Glob pattern (i.e., *.fastq)
 25
 26    Returns:
 27        The filtered list of files
 28    """
 29    def matches_glob(file: Union[File, str]):
 30        return PurePath(file if isinstance(file, str) else file.relative_path).match(pattern)
 31
 32    return [
 33        file for file in files
 34        if matches_glob(file)
 35    ]
 36
 37
 38def generate_flattened_file_map(files: List[PathLike]) -> Dict[PathLike, str]:
 39    """
 40    Generates a mapping of file paths "flattened" to their base name.
 41
 42    Example:  data1/sample1.fastq.gz -> sample1.fastq.gz
 43
 44    Args:
 45        files: List[PathLike]: List of file paths
 46
 47    Returns:
 48        Dict[PathLike, str]: Mapping of file paths to their base name
 49    """
 50    return {
 51        file: Path(file).name for file in files
 52    }
 53
 54
 55def _is_hidden_file(file_path: Path):
 56    # Remove hidden files from listing, desktop.ini .DS_Store, etc.
 57    if os.name == 'nt':
 58        attributes = win32api.GetFileAttributes(str(file_path))
 59        return attributes & (win32con.FILE_ATTRIBUTE_HIDDEN | win32con.FILE_ATTRIBUTE_SYSTEM)
 60    else:
 61        return file_path.name.startswith('.')
 62
 63
 64def get_files_in_directory(
 65    directory: Union[str, Path],
 66    include_hidden=False
 67) -> List[str]:
 68    """
 69    Returns a list of strings containing the relative path of
 70    each file within the indicated directory.
 71
 72    Args:
 73        directory (Union[str, Path]): The path to the directory
 74        include_hidden (bool): include hidden files in the returned list
 75
 76    Returns:
 77        List of files in the directory
 78    """
 79    path = Path(directory).expanduser()
 80    path_posix = str(path.as_posix())
 81
 82    paths = []
 83
 84    for file_path in path.rglob("*"):
 85        if file_path.is_dir():
 86            continue
 87
 88        if not include_hidden and _is_hidden_file(file_path):
 89            continue
 90
 91        if not file_path.exists():
 92            continue
 93
 94        str_file_path = str(file_path.as_posix())
 95        str_file_path = str_file_path.replace(f'{path_posix}/', "")
 96        paths.append(str_file_path)
 97
 98    paths.sort()
 99    return paths
100
101
102def _bytes_to_human_readable(num_bytes: int) -> str:
103    for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
104        if num_bytes < 1000.0 or unit == 'PB':
105            break
106        num_bytes /= 1000.0
107    return f"{num_bytes:,.2f} {unit}"
108
109
def get_files_stats(files: List[PathLike]) -> DirectoryStatistics:
    """
    Returns information about the list of files provided, such as the total size and number of files.

    Args:
        files (List[PathLike]): Paths (str or Path) of the files to measure

    Returns:
        DirectoryStatistics: The total size in bytes, a human-readable form
        of that size, and the number of files
    """
    # Wrap each entry in Path so plain string paths work as well as Path objects
    sizes = [Path(f).stat().st_size for f in files]
    total_size = sum(sizes)
    return DirectoryStatistics(
        size_friendly=_bytes_to_human_readable(total_size),
        size=total_size,
        number_of_files=len(sizes)
    )
121
122
def upload_directory(directory: PathLike,
                     files: List[PathLike],
                     file_path_map: Dict[PathLike, str],
                     s3_client: S3Client,
                     bucket: str,
                     prefix: str,
                     max_retries=10):
    """
    @private

    Uploads a list of files from the specified directory
    Args:
        directory (str|Path): Path to directory
        files (typing.List[str|Path]): List of paths to files within the directory
            must be the same type as directory.
        file_path_map (typing.Dict[str|Path, str]): Map of file paths from source to destination
        s3_client (cirro.clients.S3Client): S3 client
        bucket (str): S3 bucket
        prefix (str): S3 prefix
        max_retries (int): Maximum number of upload attempts per file

    Raises:
        ValueError: If the files are not all of the same type as the directory
        S3UploadFailedError, ConnectionError (botocore): If a file still fails
            to upload on its final attempt
    """
    # Ensure all files are of the same type as the directory
    if not all(isinstance(file, type(directory)) for file in files):
        raise ValueError("All files must be of the same type as the directory (str or Path)")

    for file in files:
        # String entries are joined onto the directory; Path entries are used as given
        if isinstance(file, str):
            file_path = Path(directory, file)
        else:
            file_path = file

        # Check if is present in the file_path_map
        # if it is, use the mapped value as the destination path
        if file in file_path_map:
            file_relative = file_path_map[file]
        else:
            file_relative = file_path.relative_to(directory).as_posix()

        key = f'{prefix}/{file_relative}'

        # Attempt the upload up to max_retries times
        for retry in range(max_retries):
            try:
                s3_client.upload_file(
                    file_path=file_path,
                    bucket=bucket,
                    key=key
                )
            except (S3UploadFailedError, ConnectionError) as e:
                attempts_remaining = max_retries - (retry + 1)
                if attempts_remaining == 0:
                    # Out of attempts: surface the failure instead of sleeping
                    # and then silently moving on without the file
                    raise

                # Random jitter plus a delay that grows with each failed attempt
                delay = random.uniform(0, 60) + retry * 60
                # Report the error
                print(f"Encountered error:\n{str(e)}\n"
                      f"Retrying in {delay:.0f} seconds ({attempts_remaining} attempts remaining)")
                time.sleep(delay)
            else:
                # Upload succeeded, move on to the next file
                break
187
188
def download_directory(directory: str, files: List[str], s3_client: S3Client, bucket: str, prefix: str):
    """
    @private

    Downloads each listed file from under the S3 prefix into the local directory,
    creating parent folders as needed.
    """
    for relative_path in files:
        # Leading slashes are stripped from the key (e.g. when the prefix is empty)
        object_key = f'{prefix}/{relative_path}'.lstrip('/')

        destination = Path(directory, relative_path)
        destination.parent.mkdir(parents=True, exist_ok=True)

        s3_client.download_file(
            local_path=destination,
            bucket=bucket,
            key=object_key
        )
def filter_files_by_pattern( files: Union[List[cirro.models.file.File], List[str]], pattern: str) -> Union[List[cirro.models.file.File], List[str]]:
def filter_files_by_pattern(files: Union[List[File], List[str]], pattern: str) -> Union[List[File], List[str]]:
    """
    Keeps only the entries whose path matches a glob pattern

    Args:
        files (Union[List[File], List[str]]): Files, or plain file paths, to filter
        pattern (str): Glob pattern (e.g., *.fastq)

    Returns:
        The entries that matched, in their original order
    """
    matched = []
    for entry in files:
        # Plain strings are matched as-is; anything else is matched by its relative_path
        candidate = entry if isinstance(entry, str) else entry.relative_path
        if PurePath(candidate).match(pattern):
            matched.append(entry)
    return matched

Filters a list of files by a glob pattern

Arguments:
  • files (Union[List[File], List[str]]): List of Files or file paths
  • pattern (str): Glob pattern (e.g., *.fastq)
Returns:

The filtered list of files

def generate_flattened_file_map(files: List[~PathLike]) -> Dict[~PathLike, str]:
def generate_flattened_file_map(files: List[PathLike]) -> Dict[PathLike, str]:
    """
    Builds a lookup from each file path to its "flattened" form, i.e. its base name only.

    Example:  data1/sample1.fastq.gz -> sample1.fastq.gz

    Args:
        files (List[PathLike]): List of file paths

    Returns:
        Dict[PathLike, str]: Mapping of file paths to their base name
    """
    flattened = {}
    for source_path in files:
        # The final path component is the flattened destination name
        flattened[source_path] = Path(source_path).name
    return flattened

Generates a mapping of file paths "flattened" to their base name.

Example: data1/sample1.fastq.gz -> sample1.fastq.gz

Arguments:
  • files (List[PathLike]): List of file paths
Returns:

Dict[PathLike, str]: Mapping of file paths to their base name

def get_files_in_directory(directory: Union[str, pathlib.Path], include_hidden=False) -> List[str]:
 65def get_files_in_directory(
 66    directory: Union[str, Path],
 67    include_hidden=False
 68) -> List[str]:
 69    """
 70    Returns a list of strings containing the relative path of
 71    each file within the indicated directory.
 72
 73    Args:
 74        directory (Union[str, Path]): The path to the directory
 75        include_hidden (bool): include hidden files in the returned list
 76
 77    Returns:
 78        List of files in the directory
 79    """
 80    path = Path(directory).expanduser()
 81    path_posix = str(path.as_posix())
 82
 83    paths = []
 84
 85    for file_path in path.rglob("*"):
 86        if file_path.is_dir():
 87            continue
 88
 89        if not include_hidden and _is_hidden_file(file_path):
 90            continue
 91
 92        if not file_path.exists():
 93            continue
 94
 95        str_file_path = str(file_path.as_posix())
 96        str_file_path = str_file_path.replace(f'{path_posix}/', "")
 97        paths.append(str_file_path)
 98
 99    paths.sort()
100    return paths

Returns a list of strings containing the relative path of each file within the indicated directory.

Arguments:
  • directory (Union[str, Path]): The path to the directory
  • include_hidden (bool): include hidden files in the returned list
Returns:

List of files in the directory

def get_files_stats(files: List[~PathLike]) -> cirro.models.file.DirectoryStatistics:
def get_files_stats(files: List[PathLike]) -> DirectoryStatistics:
    """
    Returns information about the list of files provided, such as the total size and number of files.

    Args:
        files (List[PathLike]): Paths (str or Path) of the files to measure

    Returns:
        DirectoryStatistics: The total size in bytes, a human-readable form
        of that size, and the number of files
    """
    # Wrap each entry in Path so plain string paths work as well as Path objects
    sizes = [Path(f).stat().st_size for f in files]
    total_size = sum(sizes)
    return DirectoryStatistics(
        size_friendly=_bytes_to_human_readable(total_size),
        size=total_size,
        number_of_files=len(sizes)
    )

Returns information about the list of files provided, such as the total size and number of files.