cirro.sdk.file
````python
import gzip
from io import BytesIO, StringIO
from typing import List

import pandas as pd

from typing import TYPE_CHECKING
if TYPE_CHECKING:
    import anndata

from cirro.cirro_client import CirroApi
from cirro.models.file import File
from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
from cirro.sdk.exceptions import DataPortalInputError
from cirro.utils import convert_size


class DataPortalFile(DataPortalAsset):
    """
    Datasets are made up of a collection of File objects in the Data Portal.
    """

    def __init__(self, file: File, client: CirroApi):
        """
        Instantiate by listing files from a dataset.

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        files = dataset.list_files()
        ```
        """
        # Attach the file object
        self._file = file
        self._client = client

    # Note that the 'name' and 'id' attributes are set to the relative path
    # The purpose of this is to support the DataPortalAssets class functions
    @property
    def id(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def name(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def file_name(self) -> str:
        """Name of file, excluding the full folder path within the dataset"""
        return self._file.name

    @property
    def relative_path(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def absolute_path(self) -> str:
        """Full URI to the file object in AWS S3"""
        return self._file.absolute_path

    @property
    def metadata(self) -> dict:
        """File metadata"""
        return self._file.metadata

    @property
    def size_bytes(self) -> int:
        """File size (in bytes)"""
        return self._file.size

    @property
    def size(self) -> str:
        """File size converted to human-readable (e.g., 4.50 GB)"""
        return convert_size(self._file.size)

    def __str__(self):
        return f"{self.relative_path} ({self.size})"

    def _get(self) -> bytes:
        """Internal method to call client.file.get_file"""

        return self._client.file.get_file(self._file)

    def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> pd.DataFrame:
        """
        Parse the file as a Pandas DataFrame.

        The default field separator is a comma (for CSV); use sep='\\t' for TSV.

        File compression is inferred from the extension, but can be set
        explicitly with the compression= flag.

        All other keyword arguments are passed to pandas.read_csv
        https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
        """

        if compression == 'infer':
            # If the file appears to be compressed
            if self.relative_path.endswith('.gz'):
                compression = dict(method='gzip')
            elif self.relative_path.endswith('.bz2'):
                compression = dict(method='bz2')
            elif self.relative_path.endswith('.xz'):
                compression = dict(method='xz')
            elif self.relative_path.endswith('.zst'):
                compression = dict(method='zstd')
            else:
                compression = None

        if compression is not None:
            handle = BytesIO(self._get())
        else:
            handle = StringIO(self._get().decode(encoding))

        df = pd.read_csv(
            handle,
            compression=compression,
            encoding=encoding,
            **kwargs
        )
        handle.close()
        return df

    def read_h5ad(self) -> 'anndata.AnnData':
        """Read an AnnData object from a file."""
        # Import the anndata library, and raise an error if it is not available
        try:
            import anndata as ad  # noqa
        except ImportError:
            raise ImportError("The anndata library is required to read AnnData files. "
                              "Please install it using 'pip install anndata'.")

        # Download the file to a temporary file handle and parse the contents
        with BytesIO(self._get()) as handle:
            return ad.read_h5ad(handle)

    def readlines(self, encoding='utf-8', compression=None) -> List[str]:
        """Read the file contents as a list of lines."""

        return self.read(
            encoding=encoding,
            compression=compression
        ).splitlines()

    def read(self, encoding='utf-8', compression=None) -> str:
        """Read the file contents as text."""

        # Get the raw file contents
        cont = self._get()

        # If the file is uncompressed
        if compression is None:
            return cont.decode(encoding)
        # If the file is compressed
        else:

            # Only gzip-compression is supported currently
            if compression != "gzip":
                raise DataPortalInputError("compression may be 'gzip' or None")

            with gzip.open(
                BytesIO(
                    cont
                ),
                'rt',
                encoding=encoding
            ) as handle:
                return handle.read()

    def download(self, download_location: str = None):
        """Download the file to a local directory."""

        if download_location is None:
            raise DataPortalInputError("Must provide download location")

        self._client.file.download_files(
            self._file.access_context,
            download_location,
            [self.relative_path]
        )


class DataPortalFiles(DataPortalAssets[DataPortalFile]):
    """Collection of DataPortalFile objects."""
    asset_name = "file"

    def download(self, download_location: str = None) -> None:
        """Download the collection of files to a local directory."""

        for f in self:
            f.download(download_location)
````
Datasets are made up of a collection of File objects in the Data Portal.
````python
    def __init__(self, file: File, client: CirroApi):
        """
        Instantiate by listing files from a dataset.

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        files = dataset.list_files()
        ```
        """
        # Attach the file object
        self._file = file
        self._client = client
````
Instantiate by listing files from a dataset.
```python
from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
    project="id-or-name-of-project",
    dataset="id-or-name-of-dataset"
)
files = dataset.list_files()
```
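Each entry in the returned collection is a DataPortalFile. The sketch below continues the example above and shows how a single file can be picked out of the listing by its relative path (the path "data/counts.csv" is a hypothetical example):

```python
from cirro import DataPortal

portal = DataPortal()
dataset = portal.get_dataset(
    project="id-or-name-of-project",
    dataset="id-or-name-of-dataset"
)
files = dataset.list_files()

# Print each file with its human-readable size (uses DataPortalFile.__str__)
for f in files:
    print(f)

# Pick out a single file by its relative path within the dataset
# ("data/counts.csv" is a hypothetical example path)
counts = next(f for f in files if f.relative_path == "data/counts.csv")
```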
```python
    @property
    def id(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path
```
Relative path of file within the dataset
```python
    @property
    def name(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path
```
Relative path of file within the dataset
```python
    @property
    def file_name(self) -> str:
        """Name of file, excluding the full folder path within the dataset"""
        return self._file.name
```
Name of file, excluding the full folder path within the dataset
```python
    @property
    def relative_path(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path
```
Relative path of file within the dataset
```python
    @property
    def absolute_path(self) -> str:
        """Full URI to the file object in AWS S3"""
        return self._file.absolute_path
```
Full URI to the file object in AWS S3
```python
    @property
    def size_bytes(self) -> int:
        """File size (in bytes)"""
        return self._file.size
```
File size (in bytes)
```python
    @property
    def size(self) -> str:
        """File size converted to human-readable (e.g., 4.50 GB)"""
        return convert_size(self._file.size)
```
File size converted to human-readable (e.g., 4.50 GB)
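Taken together, these properties describe where a file lives and how large it is. A minimal sketch, assuming file is a DataPortalFile taken from dataset.list_files() (the commented values are hypothetical examples):

```python
# 'file' is assumed to be a DataPortalFile taken from dataset.list_files()
print(file.relative_path)   # e.g. "fastq/sample1_R1.fastq.gz" (hypothetical path)
print(file.file_name)       # e.g. "sample1_R1.fastq.gz" (no folder prefix)
print(file.absolute_path)   # full S3 URI of the object
print(file.size_bytes)      # integer size in bytes
print(file.size)            # human-readable size, e.g. "4.50 GB"
```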
```python
    def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> pd.DataFrame:
        """
        Parse the file as a Pandas DataFrame.

        The default field separator is a comma (for CSV); use sep='\\t' for TSV.

        File compression is inferred from the extension, but can be set
        explicitly with the compression= flag.

        All other keyword arguments are passed to pandas.read_csv
        https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
        """

        if compression == 'infer':
            # If the file appears to be compressed
            if self.relative_path.endswith('.gz'):
                compression = dict(method='gzip')
            elif self.relative_path.endswith('.bz2'):
                compression = dict(method='bz2')
            elif self.relative_path.endswith('.xz'):
                compression = dict(method='xz')
            elif self.relative_path.endswith('.zst'):
                compression = dict(method='zstd')
            else:
                compression = None

        if compression is not None:
            handle = BytesIO(self._get())
        else:
            handle = StringIO(self._get().decode(encoding))

        df = pd.read_csv(
            handle,
            compression=compression,
            encoding=encoding,
            **kwargs
        )
        handle.close()
        return df
```
Parse the file as a Pandas DataFrame.
The default field separator is a comma (for CSV); use sep='\t' for TSV.
File compression is inferred from the extension, but can be set explicitly with the compression= flag.
All other keyword arguments are passed to pandas.read_csv: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
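A short usage sketch for read_csv; the file paths are hypothetical examples, and any extra keyword arguments are forwarded to pandas.read_csv:

```python
# 'files' is assumed to be the DataPortalFiles collection from dataset.list_files()

# Read a gzip-compressed CSV; compression is inferred from the ".gz" extension
# ("results/counts.csv.gz" is a hypothetical example path)
counts_file = next(f for f in files if f.relative_path == "results/counts.csv.gz")
counts = counts_file.read_csv()

# Read a tab-delimited table, forwarding sep= and index_col= to pandas.read_csv
meta_file = next(f for f in files if f.relative_path == "metadata.tsv")
metadata = meta_file.read_csv(sep="\t", index_col=0)
```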
```python
    def read_h5ad(self) -> 'anndata.AnnData':
        """Read an AnnData object from a file."""
        # Import the anndata library, and raise an error if it is not available
        try:
            import anndata as ad  # noqa
        except ImportError:
            raise ImportError("The anndata library is required to read AnnData files. "
                              "Please install it using 'pip install anndata'.")

        # Download the file to a temporary file handle and parse the contents
        with BytesIO(self._get()) as handle:
            return ad.read_h5ad(handle)
```
Read an AnnData object from a file.
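A minimal sketch, assuming the dataset contains an .h5ad file at a hypothetical path and that the optional anndata dependency has been installed:

```python
# 'files' is assumed to be the DataPortalFiles collection from dataset.list_files()
# "analysis/combined.h5ad" is a hypothetical example path
h5ad_file = next(f for f in files if f.relative_path == "analysis/combined.h5ad")

adata = h5ad_file.read_h5ad()   # requires 'pip install anndata'
print(adata.shape)              # (n_obs, n_vars)
```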
```python
    def readlines(self, encoding='utf-8', compression=None) -> List[str]:
        """Read the file contents as a list of lines."""

        return self.read(
            encoding=encoding,
            compression=compression
        ).splitlines()
```
Read the file contents as a list of lines.
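A minimal sketch; the log file path is a hypothetical example:

```python
# 'files' is assumed to be the DataPortalFiles collection from dataset.list_files()
# "logs/process.log" is a hypothetical example path
log_file = next(f for f in files if f.relative_path == "logs/process.log")

# readlines() decodes the file as text and splits it into individual lines
for line in log_file.readlines():
    print(line)
```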
```python
    def read(self, encoding='utf-8', compression=None) -> str:
        """Read the file contents as text."""

        # Get the raw file contents
        cont = self._get()

        # If the file is uncompressed
        if compression is None:
            return cont.decode(encoding)
        # If the file is compressed
        else:

            # Only gzip-compression is supported currently
            if compression != "gzip":
                raise DataPortalInputError("compression may be 'gzip' or None")

            with gzip.open(
                BytesIO(
                    cont
                ),
                'rt',
                encoding=encoding
            ) as handle:
                return handle.read()
```
Read the file contents as text.
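A minimal sketch showing both an uncompressed and a gzip-compressed text file; the paths are hypothetical examples:

```python
# 'files' is assumed to be the DataPortalFiles collection from dataset.list_files()

# Uncompressed text ("README.txt" is a hypothetical example path)
readme = next(f for f in files if f.relative_path == "README.txt")
text = readme.read()

# gzip-compressed text must be flagged explicitly; 'gzip' is the only
# compression value currently supported (anything else raises DataPortalInputError)
report = next(f for f in files if f.relative_path == "report.txt.gz")
report_text = report.read(compression="gzip")
```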
```python
    def download(self, download_location: str = None):
        """Download the file to a local directory."""

        if download_location is None:
            raise DataPortalInputError("Must provide download location")

        self._client.file.download_files(
            self._file.access_context,
            download_location,
            [self.relative_path]
        )
```
Download the file to a local directory.
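A minimal sketch; the local directory is a hypothetical example, and omitting it raises DataPortalInputError:

```python
# 'file' is assumed to be a DataPortalFile taken from dataset.list_files()
# "/tmp/cirro-downloads" is a hypothetical local directory
file.download("/tmp/cirro-downloads")
```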
```python
class DataPortalFiles(DataPortalAssets[DataPortalFile]):
    """Collection of DataPortalFile objects."""
    asset_name = "file"

    def download(self, download_location: str = None) -> None:
        """Download the collection of files to a local directory."""

        for f in self:
            f.download(download_location)
```
Collection of DataPortalFile objects.
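A minimal sketch that downloads every file in a dataset; the local directory is a hypothetical example:

```python
# 'dataset' is assumed to be a DataPortalDataset returned by portal.get_dataset(...)
files = dataset.list_files()

# DataPortalFiles.download() calls DataPortalFile.download() for each file in turn
files.download("/tmp/cirro-downloads")
```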