cirro.sdk.file

  1import gzip
  2from io import BytesIO, StringIO
  3from typing import List
  4
  5from typing import TYPE_CHECKING
  6if TYPE_CHECKING:
  7    import anndata
  8    from pandas import DataFrame
  9
 10from cirro.cirro_client import CirroApi
 11from cirro.models.file import File, PathLike
 12from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
 13from cirro.sdk.exceptions import DataPortalInputError
 14from cirro.utils import convert_size
 15
 16
 17class DataPortalFile(DataPortalAsset):
 18    """
 19    Datasets are made up of a collection of File objects in the Data Portal.
 20    """
 21
 22    def __init__(self, file: File, client: CirroApi):
 23        """
 24        Instantiate by listing files from a dataset.
 25
 26        ```python
 27        from cirro import DataPortal()
 28        portal = DataPortal()
 29        dataset = portal.get_dataset(
 30            project="id-or-name-of-project",
 31            dataset="id-or-name-of-dataset"
 32        )
 33        files = dataset.list_files()
 34        ```
 35        """
 36        # Attach the file object
 37        self._file = file
 38        self._client = client
 39
 40    # Note that the 'name' and 'id' attributes are set to the relative path
 41    # The purpose of this is to support the DataPortalAssets class functions
 42    @property
 43    def id(self) -> str:
 44        """Relative path of file within the dataset"""
 45        return self._file.relative_path
 46
 47    @property
 48    def name(self) -> str:
 49        """Relative path of file within the dataset"""
 50        return self._file.relative_path
 51
 52    @property
 53    def file_name(self) -> str:
 54        """Name of file, excluding the full folder path within the dataset"""
 55        return self._file.name
 56
 57    @property
 58    def relative_path(self) -> str:
 59        """Relative path of file within the dataset"""
 60        return self._file.relative_path
 61
 62    @property
 63    def absolute_path(self) -> str:
 64        """Fully URI to file object in AWS S3"""
 65        return self._file.absolute_path
 66
 67    @property
 68    def metadata(self) -> dict:
 69        """File metadata"""
 70        return self._file.metadata
 71
 72    @property
 73    def size_bytes(self) -> int:
 74        """File size (in bytes)"""
 75        return self._file.size
 76
 77    @property
 78    def size(self) -> str:
 79        """File size converted to human-readable (e.g., 4.50 GB)"""
 80        return convert_size(self._file.size)
 81
 82    def __str__(self):
 83        return f"{self.relative_path} ({self.size})"
 84
 85    def _get(self) -> bytes:
 86        """Internal method to call client.file.get_file"""
 87
 88        return self._client.file.get_file(self._file)
 89
 90    def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> 'DataFrame':
 91        """
 92        Parse the file as a Pandas DataFrame.
 93
 94        The default field separator is a comma (for CSV), use sep='\\t' for TSV.
 95
 96        File compression is inferred from the extension, but can be set
 97        explicitly with the compression= flag.
 98
 99        All other keyword arguments are passed to pandas.read_csv
100        https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
101        """
102        import pandas
103
104        if compression == 'infer':
105            # If the file appears to be compressed
106            if self.relative_path.endswith('.gz'):
107                compression = dict(method='gzip')
108            elif self.relative_path.endswith('.bz2'):
109                compression = dict(method='bz2')
110            elif self.relative_path.endswith('.xz'):
111                compression = dict(method='zstd')
112            elif self.relative_path.endswith('.zst'):
113                compression = dict(method='zstd')
114            else:
115                compression = None
116
117        if compression is not None:
118            handle = BytesIO(self._get())
119        else:
120            handle = StringIO(self._get().decode(encoding))
121
122        df = pandas.read_csv(
123            handle,
124            compression=compression,
125            encoding=encoding,
126            **kwargs
127        )
128        handle.close()
129        return df
130
131    def read_h5ad(self) -> 'anndata.AnnData':
132        """Read an AnnData object from a file."""
133        # Import the anndata library, and raise an error if it is not available
134        try:
135            import anndata as ad # noqa
136        except ImportError:
137            raise ImportError("The anndata library is required to read AnnData files. "
138                              "Please install it using 'pip install anndata'.")
139
140        # Download the file to a temporary file handle and parse the contents
141        with BytesIO(self._get()) as handle:
142            return ad.read_h5ad(handle)
143
144    def readlines(self, encoding='utf-8', compression=None) -> List[str]:
145        """Read the file contents as a list of lines."""
146
147        return self.read(
148            encoding=encoding,
149            compression=compression
150        ).splitlines()
151
152    def read(self, encoding='utf-8', compression=None) -> str:
153        """Read the file contents as text."""
154
155        # Get the raw file contents
156        cont = self._get()
157
158        # If the file is uncompressed
159        if compression is None:
160            return cont.decode(encoding)
161        # If the file is compressed
162        else:
163
164            # Only gzip-compression is supported currently
165            if compression != "gzip":
166                raise DataPortalInputError("compression may be 'gzip' or None")
167
168            with gzip.open(
169                BytesIO(
170                    cont
171                ),
172                'rt',
173                encoding=encoding
174            ) as handle:
175                return handle.read()
176
177    def download(self, download_location: str = None):
178        """Download the file to a local directory."""
179
180        if download_location is None:
181            raise DataPortalInputError("Must provide download location")
182
183        self._client.file.download_files(
184            self._file.access_context,
185            download_location,
186            [self.relative_path]
187        )
188
189    def validate(self, local_path: PathLike):
190        """
191        Validate that the local file matches the remote file by comparing checksums.
192
193        Args:
194            local_path (PathLike): Path to the local file to validate
195        Raises:
196            ValueError: If checksums do not match
197            RuntimeWarning: If the remote checksum is not available or not supported
198        """
199        self._client.file.validate_file(self._file, local_path)
200
201    def is_valid(self, local_path: PathLike) -> bool:
202        """
203        Check if the local file matches the remote file by comparing checksums.
204
205        Args:
206            local_path (PathLike): Path to the local file to validate
207        Returns:
208            bool: True if the local file matches the remote file, False otherwise
209        Raises:
210            RuntimeWarning: If the remote checksum is not available or not supported
211        """
212        if not local_path:
213            raise DataPortalInputError("Must provide local path to validate file")
214
215        try:
216            self.validate(local_path)
217            return True
218        except ValueError:
219            return False
220
221
class DataPortalFiles(DataPortalAssets[DataPortalFile]):
    """Collection of DataPortalFile objects."""
    # Singular label used by DataPortalAssets helper methods
    asset_name = "file"

    def download(self, download_location: str = None) -> None:
        """Download the collection of files to a local directory."""

        # Delegate to each member file's own download method
        for file_asset in self:
            file_asset.download(download_location)
class DataPortalFile(cirro.sdk.asset.DataPortalAsset):
 18class DataPortalFile(DataPortalAsset):
 19    """
 20    Datasets are made up of a collection of File objects in the Data Portal.
 21    """
 22
 23    def __init__(self, file: File, client: CirroApi):
 24        """
 25        Instantiate by listing files from a dataset.
 26
 27        ```python
 28        from cirro import DataPortal
 29        portal = DataPortal()
 30        dataset = portal.get_dataset(
 31            project="id-or-name-of-project",
 32            dataset="id-or-name-of-dataset"
 33        )
 34        files = dataset.list_files()
 35        ```
 36        """
 37        # Attach the file object
 38        self._file = file
 39        self._client = client
 40
 41    # Note that the 'name' and 'id' attributes are set to the relative path
 42    # The purpose of this is to support the DataPortalAssets class functions
 43    @property
 44    def id(self) -> str:
 45        """Relative path of file within the dataset"""
 46        return self._file.relative_path
 47
 48    @property
 49    def name(self) -> str:
 50        """Relative path of file within the dataset"""
 51        return self._file.relative_path
 52
 53    @property
 54    def file_name(self) -> str:
 55        """Name of file, excluding the full folder path within the dataset"""
 56        return self._file.name
 57
 58    @property
 59    def relative_path(self) -> str:
 60        """Relative path of file within the dataset"""
 61        return self._file.relative_path
 62
 63    @property
 64    def absolute_path(self) -> str:
 65        """Fully URI to file object in AWS S3"""
 66        return self._file.absolute_path
 67
 68    @property
 69    def metadata(self) -> dict:
 70        """File metadata"""
 71        return self._file.metadata
 72
 73    @property
 74    def size_bytes(self) -> int:
 75        """File size (in bytes)"""
 76        return self._file.size
 77
 78    @property
 79    def size(self) -> str:
 80        """File size converted to human-readable (e.g., 4.50 GB)"""
 81        return convert_size(self._file.size)
 82
 83    def __str__(self):
 84        return f"{self.relative_path} ({self.size})"
 85
 86    def _get(self) -> bytes:
 87        """Internal method to call client.file.get_file"""
 88
 89        return self._client.file.get_file(self._file)
 90
 91    def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> 'DataFrame':
 92        """
 93        Parse the file as a Pandas DataFrame.
 94
 95        The default field separator is a comma (for CSV), use sep='\\t' for TSV.
 96
 97        File compression is inferred from the extension, but can be set
 98        explicitly with the compression= flag.
 99
100        All other keyword arguments are passed to pandas.read_csv
101        https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
102        """
103        import pandas
104
105        if compression == 'infer':
106            # If the file appears to be compressed
107            if self.relative_path.endswith('.gz'):
108                compression = dict(method='gzip')
109            elif self.relative_path.endswith('.bz2'):
110                compression = dict(method='bz2')
111            elif self.relative_path.endswith('.xz'):
112                compression = dict(method='zstd')
113            elif self.relative_path.endswith('.zst'):
114                compression = dict(method='zstd')
115            else:
116                compression = None
117
118        if compression is not None:
119            handle = BytesIO(self._get())
120        else:
121            handle = StringIO(self._get().decode(encoding))
122
123        df = pandas.read_csv(
124            handle,
125            compression=compression,
126            encoding=encoding,
127            **kwargs
128        )
129        handle.close()
130        return df
131
132    def read_h5ad(self) -> 'anndata.AnnData':
133        """Read an AnnData object from a file."""
134        # Import the anndata library, and raise an error if it is not available
135        try:
136            import anndata as ad # noqa
137        except ImportError:
138            raise ImportError("The anndata library is required to read AnnData files. "
139                              "Please install it using 'pip install anndata'.")
140
141        # Download the file to a temporary file handle and parse the contents
142        with BytesIO(self._get()) as handle:
143            return ad.read_h5ad(handle)
144
145    def readlines(self, encoding='utf-8', compression=None) -> List[str]:
146        """Read the file contents as a list of lines."""
147
148        return self.read(
149            encoding=encoding,
150            compression=compression
151        ).splitlines()
152
153    def read(self, encoding='utf-8', compression=None) -> str:
154        """Read the file contents as text."""
155
156        # Get the raw file contents
157        cont = self._get()
158
159        # If the file is uncompressed
160        if compression is None:
161            return cont.decode(encoding)
162        # If the file is compressed
163        else:
164
165            # Only gzip-compression is supported currently
166            if compression != "gzip":
167                raise DataPortalInputError("compression may be 'gzip' or None")
168
169            with gzip.open(
170                BytesIO(
171                    cont
172                ),
173                'rt',
174                encoding=encoding
175            ) as handle:
176                return handle.read()
177
178    def download(self, download_location: str = None):
179        """Download the file to a local directory."""
180
181        if download_location is None:
182            raise DataPortalInputError("Must provide download location")
183
184        self._client.file.download_files(
185            self._file.access_context,
186            download_location,
187            [self.relative_path]
188        )
189
190    def validate(self, local_path: PathLike):
191        """
192        Validate that the local file matches the remote file by comparing checksums.
193
194        Args:
195            local_path (PathLike): Path to the local file to validate
196        Raises:
197            ValueError: If checksums do not match
198            RuntimeWarning: If the remote checksum is not available or not supported
199        """
200        self._client.file.validate_file(self._file, local_path)
201
202    def is_valid(self, local_path: PathLike) -> bool:
203        """
204        Check if the local file matches the remote file by comparing checksums.
205
206        Args:
207            local_path (PathLike): Path to the local file to validate
208        Returns:
209            bool: True if the local file matches the remote file, False otherwise
210        Raises:
211            RuntimeWarning: If the remote checksum is not available or not supported
212        """
213        if not local_path:
214            raise DataPortalInputError("Must provide local path to validate file")
215
216        try:
217            self.validate(local_path)
218            return True
219        except ValueError:
220            return False

Datasets are made up of a collection of File objects in the Data Portal.

DataPortalFile(file: cirro.models.file.File, client: cirro.CirroApi)
23    def __init__(self, file: File, client: CirroApi):
24        """
25        Instantiate by listing files from a dataset.
26
27        ```python
28        from cirro import DataPortal
29        portal = DataPortal()
30        dataset = portal.get_dataset(
31            project="id-or-name-of-project",
32            dataset="id-or-name-of-dataset"
33        )
34        files = dataset.list_files()
35        ```
36        """
37        # Attach the file object
38        self._file = file
39        self._client = client

Instantiate by listing files from a dataset.

from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
    project="id-or-name-of-project",
    dataset="id-or-name-of-dataset"
)
files = dataset.list_files()
id: str
43    @property
44    def id(self) -> str:
45        """Relative path of file within the dataset"""
46        return self._file.relative_path

Relative path of file within the dataset

name: str
48    @property
49    def name(self) -> str:
50        """Relative path of file within the dataset"""
51        return self._file.relative_path

Relative path of file within the dataset

file_name: str
53    @property
54    def file_name(self) -> str:
55        """Name of file, excluding the full folder path within the dataset"""
56        return self._file.name

Name of file, excluding the full folder path within the dataset

relative_path: str
58    @property
59    def relative_path(self) -> str:
60        """Relative path of file within the dataset"""
61        return self._file.relative_path

Relative path of file within the dataset

absolute_path: str
63    @property
64    def absolute_path(self) -> str:
65        """Fully URI to file object in AWS S3"""
66        return self._file.absolute_path

Full URI to the file object in AWS S3

metadata: dict
68    @property
69    def metadata(self) -> dict:
70        """File metadata"""
71        return self._file.metadata

File metadata

size_bytes: int
73    @property
74    def size_bytes(self) -> int:
75        """File size (in bytes)"""
76        return self._file.size

File size (in bytes)

size: str
78    @property
79    def size(self) -> str:
80        """File size converted to human-readable (e.g., 4.50 GB)"""
81        return convert_size(self._file.size)

File size converted to human-readable (e.g., 4.50 GB)

def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> 'DataFrame':
 91    def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> 'DataFrame':
 92        """
 93        Parse the file as a Pandas DataFrame.
 94
 95        The default field separator is a comma (for CSV), use sep='\\t' for TSV.
 96
 97        File compression is inferred from the extension, but can be set
 98        explicitly with the compression= flag.
 99
100        All other keyword arguments are passed to pandas.read_csv
101        https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
102        """
103        import pandas
104
105        if compression == 'infer':
106            # If the file appears to be compressed
107            if self.relative_path.endswith('.gz'):
108                compression = dict(method='gzip')
109            elif self.relative_path.endswith('.bz2'):
110                compression = dict(method='bz2')
111            elif self.relative_path.endswith('.xz'):
112                compression = dict(method='zstd')
113            elif self.relative_path.endswith('.zst'):
114                compression = dict(method='zstd')
115            else:
116                compression = None
117
118        if compression is not None:
119            handle = BytesIO(self._get())
120        else:
121            handle = StringIO(self._get().decode(encoding))
122
123        df = pandas.read_csv(
124            handle,
125            compression=compression,
126            encoding=encoding,
127            **kwargs
128        )
129        handle.close()
130        return df

Parse the file as a Pandas DataFrame.

The default field separator is a comma (for CSV), use sep='\t' for TSV.

File compression is inferred from the extension, but can be set explicitly with the compression= flag.

All other keyword arguments are passed to pandas.read_csv https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

def read_h5ad(self) -> 'anndata.AnnData':
132    def read_h5ad(self) -> 'anndata.AnnData':
133        """Read an AnnData object from a file."""
134        # Import the anndata library, and raise an error if it is not available
135        try:
136            import anndata as ad # noqa
137        except ImportError:
138            raise ImportError("The anndata library is required to read AnnData files. "
139                              "Please install it using 'pip install anndata'.")
140
141        # Download the file to a temporary file handle and parse the contents
142        with BytesIO(self._get()) as handle:
143            return ad.read_h5ad(handle)

Read an AnnData object from a file.

def readlines(self, encoding='utf-8', compression=None) -> List[str]:
145    def readlines(self, encoding='utf-8', compression=None) -> List[str]:
146        """Read the file contents as a list of lines."""
147
148        return self.read(
149            encoding=encoding,
150            compression=compression
151        ).splitlines()

Read the file contents as a list of lines.

def read(self, encoding='utf-8', compression=None) -> str:
153    def read(self, encoding='utf-8', compression=None) -> str:
154        """Read the file contents as text."""
155
156        # Get the raw file contents
157        cont = self._get()
158
159        # If the file is uncompressed
160        if compression is None:
161            return cont.decode(encoding)
162        # If the file is compressed
163        else:
164
165            # Only gzip-compression is supported currently
166            if compression != "gzip":
167                raise DataPortalInputError("compression may be 'gzip' or None")
168
169            with gzip.open(
170                BytesIO(
171                    cont
172                ),
173                'rt',
174                encoding=encoding
175            ) as handle:
176                return handle.read()

Read the file contents as text.

def download(self, download_location: str = None):
178    def download(self, download_location: str = None):
179        """Download the file to a local directory."""
180
181        if download_location is None:
182            raise DataPortalInputError("Must provide download location")
183
184        self._client.file.download_files(
185            self._file.access_context,
186            download_location,
187            [self.relative_path]
188        )

Download the file to a local directory.

def validate(self, local_path: ~PathLike):
190    def validate(self, local_path: PathLike):
191        """
192        Validate that the local file matches the remote file by comparing checksums.
193
194        Args:
195            local_path (PathLike): Path to the local file to validate
196        Raises:
197            ValueError: If checksums do not match
198            RuntimeWarning: If the remote checksum is not available or not supported
199        """
200        self._client.file.validate_file(self._file, local_path)

Validate that the local file matches the remote file by comparing checksums.

Arguments:
  • local_path (PathLike): Path to the local file to validate
Raises:
  • ValueError: If checksums do not match
  • RuntimeWarning: If the remote checksum is not available or not supported
def is_valid(self, local_path: ~PathLike) -> bool:
202    def is_valid(self, local_path: PathLike) -> bool:
203        """
204        Check if the local file matches the remote file by comparing checksums.
205
206        Args:
207            local_path (PathLike): Path to the local file to validate
208        Returns:
209            bool: True if the local file matches the remote file, False otherwise
210        Raises:
211            RuntimeWarning: If the remote checksum is not available or not supported
212        """
213        if not local_path:
214            raise DataPortalInputError("Must provide local path to validate file")
215
216        try:
217            self.validate(local_path)
218            return True
219        except ValueError:
220            return False

Check if the local file matches the remote file by comparing checksums.

Arguments:
  • local_path (PathLike): Path to the local file to validate
Returns:

bool: True if the local file matches the remote file, False otherwise

Raises:
  • RuntimeWarning: If the remote checksum is not available or not supported
223class DataPortalFiles(DataPortalAssets[DataPortalFile]):
224    """Collection of DataPortalFile objects."""
225    asset_name = "file"
226
227    def download(self, download_location: str = None) -> None:
228        """Download the collection of files to a local directory."""
229
230        for f in self:
231            f.download(download_location)

Collection of DataPortalFile objects.

asset_name = 'file'
def download(self, download_location: str = None) -> None:
227    def download(self, download_location: str = None) -> None:
228        """Download the collection of files to a local directory."""
229
230        for f in self:
231            f.download(download_location)

Download the collection of files to a local directory.