cirro.file_utils

  1import os
  2import random
  3import time
  4from pathlib import Path, PurePath
  5from typing import List, Union
  6
  7from boto3.exceptions import S3UploadFailedError
  8from botocore.exceptions import ConnectionError
  9
 10from cirro.clients import S3Client
 11from cirro.models.file import DirectoryStatistics, File
 12
 13if os.name == 'nt':
 14    import win32api
 15    import win32con
 16
 17
 18def filter_files_by_pattern(files: Union[List[File], List[str]], pattern: str) -> Union[List[File], List[str]]:
 19    """
 20    Filters a list of files by a glob pattern
 21
 22    Args:
 23        files (Union[List[File], List[str]]): List of Files or file paths
 24        pattern (str): Glob pattern (i.e., *.fastq)
 25
 26    Returns:
 27        The filtered list of files
 28    """
 29    def matches_glob(file: Union[File, str]):
 30        return PurePath(file if isinstance(file, str) else file.relative_path).match(pattern)
 31
 32    return [
 33        file for file in files
 34        if matches_glob(file)
 35    ]
 36
 37
 38def _is_hidden_file(file_path: Path):
 39    # Remove hidden files from listing, desktop.ini .DS_Store, etc.
 40    if os.name == 'nt':
 41        attributes = win32api.GetFileAttributes(str(file_path))
 42        return attributes & (win32con.FILE_ATTRIBUTE_HIDDEN | win32con.FILE_ATTRIBUTE_SYSTEM)
 43    else:
 44        return file_path.name.startswith('.')
 45
 46
 47def get_files_in_directory(
 48    directory: Union[str, Path],
 49    include_hidden=False
 50) -> List[str]:
 51    """
 52    Returns a list of strings containing the relative path of
 53    each file within the indicated directory.
 54
 55    Args:
 56        directory (Union[str, Path]): The path to the directory
 57        include_hidden (bool): include hidden files in the returned list
 58
 59    Returns:
 60        List of files in the directory
 61    """
 62    path = Path(directory).expanduser()
 63    path_posix = str(path.as_posix())
 64
 65    paths = []
 66
 67    for file_path in path.rglob("*"):
 68        if file_path.is_dir():
 69            continue
 70
 71        if not include_hidden and _is_hidden_file(file_path):
 72            continue
 73
 74        if not file_path.exists():
 75            continue
 76
 77        str_file_path = str(file_path.as_posix())
 78        str_file_path = str_file_path.replace(f'{path_posix}/', "")
 79        paths.append(str_file_path)
 80
 81    paths.sort()
 82    return paths
 83
 84
 85def get_files_stats(files: List[Path]) -> DirectoryStatistics:
 86    """
 87    @private
 88    """
 89    sizes = [f.stat().st_size for f in files]
 90    total_size = sum(sizes) / float(1 << 30)
 91    return {
 92        'sizeFriendly': f'{total_size:,.3f} GB',
 93        'size': total_size,
 94        'numberOfFiles': len(sizes)
 95    }
 96
 97
 98def upload_directory(directory: str, files: List[str], s3_client: S3Client, bucket: str, prefix: str, max_retries=10):
 99    """
100    @private
101    """
102    for file in files:
103        key = f'{prefix}/{file}'
104        local_path = Path(directory, file)
105        success = False
106
107        # Retry up to max_retries times
108        for retry in range(max_retries):
109
110            # Try the upload
111            try:
112                s3_client.upload_file(
113                    local_path=local_path,
114                    bucket=bucket,
115                    key=key
116                )
117
118                success = True
119
120            # Catch the upload error
121            except (S3UploadFailedError, ConnectionError) as e:
122                delay = random.uniform(0, 60) + retry * 60
123                # Report the error
124                print(f"Encountered error:\n{str(e)}\n"
125                      f"Retrying in {delay:.0f} seconds ({max_retries - (retry + 1)} attempts remaining)")
126                time.sleep(delay)
127
128            if success:
129                break
130
131
def download_directory(directory: str, files: List[str], s3_client: S3Client, bucket: str, prefix: str):
    """
    @private

    Downloads each listed file from `{prefix}/{file}` in the bucket
    (leading slashes stripped) into the local directory, creating any
    missing parent folders first.
    """
    for relative_name in files:
        remote_key = f'{prefix}/{relative_name}'.lstrip('/')

        destination = Path(directory, relative_name)
        # Make sure the destination folder exists before downloading
        destination.parent.mkdir(parents=True, exist_ok=True)

        s3_client.download_file(
            local_path=destination,
            bucket=bucket,
            key=remote_key
        )
def filter_files_by_pattern( files: Union[List[cirro.models.file.File], List[str]], pattern: str) -> Union[List[cirro.models.file.File], List[str]]:
def filter_files_by_pattern(files: Union[List[File], List[str]], pattern: str) -> Union[List[File], List[str]]:
    """
    Selects the entries of a file list whose path matches a glob pattern

    Args:
        files (Union[List[File], List[str]]): List of Files or file paths
        pattern (str): Glob pattern (e.g., *.fastq)

    Returns:
        The matching entries, in their original order
    """
    matched = []
    for entry in files:
        # Plain strings are matched directly; other entries are matched on their relative_path
        candidate = entry if isinstance(entry, str) else entry.relative_path
        if PurePath(candidate).match(pattern):
            matched.append(entry)
    return matched

Filters a list of files by a glob pattern

Arguments:
  • files (Union[List[File], List[str]]): List of Files or file paths
  • pattern (str): Glob pattern (e.g., *.fastq)
Returns:

The filtered list of files

def get_files_in_directory(directory: Union[str, pathlib.Path], include_hidden=False) -> List[str]:
def get_files_in_directory(
    directory: Union[str, Path],
    include_hidden=False
) -> List[str]:
    """
    Returns a sorted list of strings containing the relative path of
    each file within the indicated directory (searched recursively).

    Args:
        directory (Union[str, Path]): The path to the directory
        include_hidden (bool): Whether to include hidden files in the returned list

    Returns:
        List of files in the directory, as POSIX-style paths relative to it
    """
    path = Path(directory).expanduser()

    paths = []

    for file_path in path.rglob("*"):
        if file_path.is_dir():
            continue

        if not include_hidden and _is_hidden_file(file_path):
            continue

        # Skip entries that do not resolve to an existing file (e.g. broken symlinks)
        if not file_path.exists():
            continue

        # relative_to strips only the leading directory; a plain str.replace
        # would also remove later occurrences of the same prefix in the path
        paths.append(file_path.relative_to(path).as_posix())

    paths.sort()
    return paths

Returns a list of strings containing the relative path of each file within the indicated directory.

Arguments:
  • directory (Union[str, Path]): The path to the directory
  • include_hidden (bool): Whether to include hidden files in the returned list
Returns:

List of files in the directory