cirro.sdk.file
import gzip
from io import BytesIO, StringIO
from typing import List, TYPE_CHECKING

if TYPE_CHECKING:
    import anndata
    from pandas import DataFrame

from cirro.cirro_client import CirroApi
from cirro.models.file import File, PathLike
from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
from cirro.sdk.exceptions import DataPortalInputError
from cirro.utils import convert_size


class DataPortalFile(DataPortalAsset):
    """
    Datasets are made up of a collection of File objects in the Data Portal.
    """

    def __init__(self, file: File, client: CirroApi):
        """
        Instantiate by listing files from a dataset.

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        files = dataset.list_files()
        ```
        """
        # Attach the file object and the API client used to fetch its contents
        self._file = file
        self._client = client

    # Note that the 'name' and 'id' attributes are set to the relative path.
    # The purpose of this is to support the DataPortalAssets class functions.
    @property
    def id(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def name(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def file_name(self) -> str:
        """Name of file, excluding the full folder path within the dataset"""
        return self._file.name

    @property
    def relative_path(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def absolute_path(self) -> str:
        """Full URI to the file object in AWS S3"""
        return self._file.absolute_path

    @property
    def metadata(self) -> dict:
        """File metadata"""
        return self._file.metadata

    @property
    def size_bytes(self) -> int:
        """File size (in bytes)"""
        return self._file.size

    @property
    def size(self) -> str:
        """File size converted to human-readable (e.g., 4.50 GB)"""
        return convert_size(self._file.size)

    def __str__(self):
        return f"{self.relative_path} ({self.size})"

    def _get(self) -> bytes:
        """Internal method to call client.file.get_file"""
        return self._client.file.get_file(self._file)

    def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> 'DataFrame':
        """
        Parse the file as a Pandas DataFrame.

        The default field separator is a comma (for CSV), use sep='\\t' for TSV.

        File compression is inferred from the extension, but can be set
        explicitly with the compression= flag.

        All other keyword arguments are passed to pandas.read_csv
        https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
        """
        import pandas

        if compression == 'infer':
            # If the file appears to be compressed
            if self.relative_path.endswith('.gz'):
                compression = dict(method='gzip')
            elif self.relative_path.endswith('.bz2'):
                compression = dict(method='bz2')
            elif self.relative_path.endswith('.xz'):
                # BUG FIX: .xz files are LZMA-compressed ('xz'), not zstd
                compression = dict(method='xz')
            elif self.relative_path.endswith('.zst'):
                compression = dict(method='zstd')
            else:
                compression = None

        # Compressed content must stay as bytes; plain text is decoded up front
        if compression is not None:
            handle = BytesIO(self._get())
        else:
            handle = StringIO(self._get().decode(encoding))

        df = pandas.read_csv(
            handle,
            compression=compression,
            encoding=encoding,
            **kwargs
        )
        handle.close()
        return df

    def read_h5ad(self) -> 'anndata.AnnData':
        """Read an AnnData object from a file."""
        # Import the anndata library, and raise an error if it is not available
        try:
            import anndata as ad  # noqa
        except ImportError:
            raise ImportError("The anndata library is required to read AnnData files. "
                              "Please install it using 'pip install anndata'.")

        # Download the file to an in-memory handle and parse the contents
        with BytesIO(self._get()) as handle:
            return ad.read_h5ad(handle)

    def readlines(self, encoding='utf-8', compression=None) -> List[str]:
        """Read the file contents as a list of lines."""
        return self.read(
            encoding=encoding,
            compression=compression
        ).splitlines()

    def read(self, encoding='utf-8', compression=None) -> str:
        """Read the file contents as text."""
        # Get the raw file contents
        cont = self._get()

        # If the file is uncompressed
        if compression is None:
            return cont.decode(encoding)

        # Only gzip-compression is supported currently
        if compression != "gzip":
            raise DataPortalInputError("compression may be 'gzip' or None")

        # Decompress in memory and decode as text
        with gzip.open(BytesIO(cont), 'rt', encoding=encoding) as handle:
            return handle.read()

    def download(self, download_location: str = None):
        """Download the file to a local directory."""
        if download_location is None:
            raise DataPortalInputError("Must provide download location")

        self._client.file.download_files(
            self._file.access_context,
            download_location,
            [self.relative_path]
        )

    def validate(self, local_path: PathLike):
        """
        Validate that the local file matches the remote file by comparing checksums.

        Args:
            local_path (PathLike): Path to the local file to validate
        Raises:
            ValueError: If checksums do not match
            RuntimeWarning: If the remote checksum is not available or not supported
        """
        self._client.file.validate_file(self._file, local_path)

    def is_valid(self, local_path: PathLike) -> bool:
        """
        Check if the local file matches the remote file by comparing checksums.

        Args:
            local_path (PathLike): Path to the local file to validate
        Returns:
            bool: True if the local file matches the remote file, False otherwise
        Raises:
            RuntimeWarning: If the remote checksum is not available or not supported
        """
        if not local_path:
            raise DataPortalInputError("Must provide local path to validate file")

        try:
            self.validate(local_path)
            return True
        except ValueError:
            return False


class DataPortalFiles(DataPortalAssets[DataPortalFile]):
    """Collection of DataPortalFile objects."""
    asset_name = "file"

    def download(self, download_location: str = None) -> None:
        """Download the collection of files to a local directory."""
        for f in self:
            f.download(download_location)
class DataPortalFile(DataPortalAsset):
    """
    Datasets are made up of a collection of File objects in the Data Portal.
    """

    def __init__(self, file: File, client: CirroApi):
        """
        Instantiate by listing files from a dataset.

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        files = dataset.list_files()
        ```
        """
        # Attach the file object and the API client used to fetch its contents
        self._file = file
        self._client = client

    # Note that the 'name' and 'id' attributes are set to the relative path.
    # The purpose of this is to support the DataPortalAssets class functions.
    @property
    def id(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def name(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def file_name(self) -> str:
        """Name of file, excluding the full folder path within the dataset"""
        return self._file.name

    @property
    def relative_path(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def absolute_path(self) -> str:
        """Full URI to the file object in AWS S3"""
        return self._file.absolute_path

    @property
    def metadata(self) -> dict:
        """File metadata"""
        return self._file.metadata

    @property
    def size_bytes(self) -> int:
        """File size (in bytes)"""
        return self._file.size

    @property
    def size(self) -> str:
        """File size converted to human-readable (e.g., 4.50 GB)"""
        return convert_size(self._file.size)

    def __str__(self):
        return f"{self.relative_path} ({self.size})"

    def _get(self) -> bytes:
        """Internal method to call client.file.get_file"""
        return self._client.file.get_file(self._file)

    def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> 'DataFrame':
        """
        Parse the file as a Pandas DataFrame.

        The default field separator is a comma (for CSV), use sep='\\t' for TSV.

        File compression is inferred from the extension, but can be set
        explicitly with the compression= flag.

        All other keyword arguments are passed to pandas.read_csv
        https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
        """
        import pandas

        if compression == 'infer':
            # If the file appears to be compressed
            if self.relative_path.endswith('.gz'):
                compression = dict(method='gzip')
            elif self.relative_path.endswith('.bz2'):
                compression = dict(method='bz2')
            elif self.relative_path.endswith('.xz'):
                # BUG FIX: .xz files are LZMA-compressed ('xz'), not zstd
                compression = dict(method='xz')
            elif self.relative_path.endswith('.zst'):
                compression = dict(method='zstd')
            else:
                compression = None

        # Compressed content must stay as bytes; plain text is decoded up front
        if compression is not None:
            handle = BytesIO(self._get())
        else:
            handle = StringIO(self._get().decode(encoding))

        df = pandas.read_csv(
            handle,
            compression=compression,
            encoding=encoding,
            **kwargs
        )
        handle.close()
        return df

    def read_h5ad(self) -> 'anndata.AnnData':
        """Read an AnnData object from a file."""
        # Import the anndata library, and raise an error if it is not available
        try:
            import anndata as ad  # noqa
        except ImportError:
            raise ImportError("The anndata library is required to read AnnData files. "
                              "Please install it using 'pip install anndata'.")

        # Download the file to an in-memory handle and parse the contents
        with BytesIO(self._get()) as handle:
            return ad.read_h5ad(handle)

    def readlines(self, encoding='utf-8', compression=None) -> List[str]:
        """Read the file contents as a list of lines."""
        return self.read(
            encoding=encoding,
            compression=compression
        ).splitlines()

    def read(self, encoding='utf-8', compression=None) -> str:
        """Read the file contents as text."""
        # Get the raw file contents
        cont = self._get()

        # If the file is uncompressed
        if compression is None:
            return cont.decode(encoding)

        # Only gzip-compression is supported currently
        if compression != "gzip":
            raise DataPortalInputError("compression may be 'gzip' or None")

        # Decompress in memory and decode as text
        with gzip.open(BytesIO(cont), 'rt', encoding=encoding) as handle:
            return handle.read()

    def download(self, download_location: str = None):
        """Download the file to a local directory."""
        if download_location is None:
            raise DataPortalInputError("Must provide download location")

        self._client.file.download_files(
            self._file.access_context,
            download_location,
            [self.relative_path]
        )

    def validate(self, local_path: PathLike):
        """
        Validate that the local file matches the remote file by comparing checksums.

        Args:
            local_path (PathLike): Path to the local file to validate
        Raises:
            ValueError: If checksums do not match
            RuntimeWarning: If the remote checksum is not available or not supported
        """
        self._client.file.validate_file(self._file, local_path)

    def is_valid(self, local_path: PathLike) -> bool:
        """
        Check if the local file matches the remote file by comparing checksums.

        Args:
            local_path (PathLike): Path to the local file to validate
        Returns:
            bool: True if the local file matches the remote file, False otherwise
        Raises:
            RuntimeWarning: If the remote checksum is not available or not supported
        """
        if not local_path:
            raise DataPortalInputError("Must provide local path to validate file")

        try:
            self.validate(local_path)
            return True
        except ValueError:
            return False
Datasets are made up of a collection of File objects in the Data Portal.
23 def __init__(self, file: File, client: CirroApi): 24 """ 25 Instantiate by listing files from a dataset. 26 27 ```python 28 from cirro import DataPortal() 29 portal = DataPortal() 30 dataset = portal.get_dataset( 31 project="id-or-name-of-project", 32 dataset="id-or-name-of-dataset" 33 ) 34 files = dataset.list_files() 35 ``` 36 """ 37 # Attach the file object 38 self._file = file 39 self._client = client
Instantiate by listing files from a dataset.
from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
project="id-or-name-of-project",
dataset="id-or-name-of-dataset"
)
files = dataset.list_files()
43 @property 44 def id(self) -> str: 45 """Relative path of file within the dataset""" 46 return self._file.relative_path
Relative path of file within the dataset
48 @property 49 def name(self) -> str: 50 """Relative path of file within the dataset""" 51 return self._file.relative_path
Relative path of file within the dataset
53 @property 54 def file_name(self) -> str: 55 """Name of file, excluding the full folder path within the dataset""" 56 return self._file.name
Name of file, excluding the full folder path within the dataset
58 @property 59 def relative_path(self) -> str: 60 """Relative path of file within the dataset""" 61 return self._file.relative_path
Relative path of file within the dataset
63 @property 64 def absolute_path(self) -> str: 65 """Fully URI to file object in AWS S3""" 66 return self._file.absolute_path
Full URI of the file object in AWS S3
73 @property 74 def size_bytes(self) -> int: 75 """File size (in bytes)""" 76 return self._file.size
File size (in bytes)
78 @property 79 def size(self) -> str: 80 """File size converted to human-readable (e.g., 4.50 GB)""" 81 return convert_size(self._file.size)
File size converted to human-readable (e.g., 4.50 GB)
91 def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> 'DataFrame': 92 """ 93 Parse the file as a Pandas DataFrame. 94 95 The default field separator is a comma (for CSV), use sep='\\t' for TSV. 96 97 File compression is inferred from the extension, but can be set 98 explicitly with the compression= flag. 99 100 All other keyword arguments are passed to pandas.read_csv 101 https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html 102 """ 103 import pandas 104 105 if compression == 'infer': 106 # If the file appears to be compressed 107 if self.relative_path.endswith('.gz'): 108 compression = dict(method='gzip') 109 elif self.relative_path.endswith('.bz2'): 110 compression = dict(method='bz2') 111 elif self.relative_path.endswith('.xz'): 112 compression = dict(method='zstd') 113 elif self.relative_path.endswith('.zst'): 114 compression = dict(method='zstd') 115 else: 116 compression = None 117 118 if compression is not None: 119 handle = BytesIO(self._get()) 120 else: 121 handle = StringIO(self._get().decode(encoding)) 122 123 df = pandas.read_csv( 124 handle, 125 compression=compression, 126 encoding=encoding, 127 **kwargs 128 ) 129 handle.close() 130 return df
Parse the file as a Pandas DataFrame.
The default field separator is a comma (for CSV), use sep='\t' for TSV.
File compression is inferred from the extension, but can be set explicitly with the compression= flag.
All other keyword arguments are passed to pandas.read_csv https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
132 def read_h5ad(self) -> 'anndata.AnnData': 133 """Read an AnnData object from a file.""" 134 # Import the anndata library, and raise an error if it is not available 135 try: 136 import anndata as ad # noqa 137 except ImportError: 138 raise ImportError("The anndata library is required to read AnnData files. " 139 "Please install it using 'pip install anndata'.") 140 141 # Download the file to a temporary file handle and parse the contents 142 with BytesIO(self._get()) as handle: 143 return ad.read_h5ad(handle)
Read an AnnData object from a file.
145 def readlines(self, encoding='utf-8', compression=None) -> List[str]: 146 """Read the file contents as a list of lines.""" 147 148 return self.read( 149 encoding=encoding, 150 compression=compression 151 ).splitlines()
Read the file contents as a list of lines.
153 def read(self, encoding='utf-8', compression=None) -> str: 154 """Read the file contents as text.""" 155 156 # Get the raw file contents 157 cont = self._get() 158 159 # If the file is uncompressed 160 if compression is None: 161 return cont.decode(encoding) 162 # If the file is compressed 163 else: 164 165 # Only gzip-compression is supported currently 166 if compression != "gzip": 167 raise DataPortalInputError("compression may be 'gzip' or None") 168 169 with gzip.open( 170 BytesIO( 171 cont 172 ), 173 'rt', 174 encoding=encoding 175 ) as handle: 176 return handle.read()
Read the file contents as text.
178 def download(self, download_location: str = None): 179 """Download the file to a local directory.""" 180 181 if download_location is None: 182 raise DataPortalInputError("Must provide download location") 183 184 self._client.file.download_files( 185 self._file.access_context, 186 download_location, 187 [self.relative_path] 188 )
Download the file to a local directory.
190 def validate(self, local_path: PathLike): 191 """ 192 Validate that the local file matches the remote file by comparing checksums. 193 194 Args: 195 local_path (PathLike): Path to the local file to validate 196 Raises: 197 ValueError: If checksums do not match 198 RuntimeWarning: If the remote checksum is not available or not supported 199 """ 200 self._client.file.validate_file(self._file, local_path)
Validate that the local file matches the remote file by comparing checksums.
Arguments:
- local_path (PathLike): Path to the local file to validate
Raises:
- ValueError: If checksums do not match
- RuntimeWarning: If the remote checksum is not available or not supported
202 def is_valid(self, local_path: PathLike) -> bool: 203 """ 204 Check if the local file matches the remote file by comparing checksums. 205 206 Args: 207 local_path (PathLike): Path to the local file to validate 208 Returns: 209 bool: True if the local file matches the remote file, False otherwise 210 Raises: 211 RuntimeWarning: If the remote checksum is not available or not supported 212 """ 213 if not local_path: 214 raise DataPortalInputError("Must provide local path to validate file") 215 216 try: 217 self.validate(local_path) 218 return True 219 except ValueError: 220 return False
Check if the local file matches the remote file by comparing checksums.
Arguments:
- local_path (PathLike): Path to the local file to validate
Returns:
bool: True if the local file matches the remote file, False otherwise
Raises:
- RuntimeWarning: If the remote checksum is not available or not supported
class DataPortalFiles(DataPortalAssets[DataPortalFile]):
    """A collection of DataPortalFile objects."""

    asset_name = "file"

    def download(self, download_location: str = None) -> None:
        """Download every file in the collection to a local directory."""
        for file_asset in self:
            file_asset.download(download_location)
Collection of DataPortalFile objects.