cirro.sdk.file

import gzip
from io import BytesIO, StringIO
from typing import List

import pandas as pd

from typing import TYPE_CHECKING
if TYPE_CHECKING:
    import anndata

from cirro.cirro_client import CirroApi
from cirro.models.file import File
from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
from cirro.sdk.exceptions import DataPortalInputError
from cirro.utils import convert_size


class DataPortalFile(DataPortalAsset):
    """
    Datasets are made up of a collection of File objects in the Data Portal.
    """

    def __init__(self, file: File, client: CirroApi):
        """
        Instantiate by listing files from a dataset.

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        files = dataset.list_files()
        ```
        """
        # Attach the file object and the API client
        self._file = file
        self._client = client

    # Note that the 'name' and 'id' attributes are set to the relative path
    # The purpose of this is to support the DataPortalAssets class functions
    @property
    def id(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def name(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def file_name(self) -> str:
        """Name of file, excluding the full folder path within the dataset"""
        return self._file.name

    @property
    def relative_path(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def absolute_path(self) -> str:
        """Full URI to the file object in AWS S3"""
        return self._file.absolute_path

    @property
    def metadata(self) -> dict:
        """File metadata"""
        return self._file.metadata

    @property
    def size_bytes(self) -> int:
        """File size (in bytes)"""
        return self._file.size

    @property
    def size(self) -> str:
        """File size converted to a human-readable string (e.g., 4.50 GB)"""
        return convert_size(self._file.size)

    def __str__(self):
        return f"{self.relative_path} ({self.size})"

    def _get(self) -> bytes:
        """Internal method to call client.file.get_file"""

        return self._client.file.get_file(self._file)

    def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> pd.DataFrame:
        """
        Parse the file as a Pandas DataFrame.

        The default field separator is a comma (CSV); use sep='\\t' for TSV.

        File compression is inferred from the file extension, but can be set
        explicitly with the compression= argument.

        All other keyword arguments are passed to pandas.read_csv:
        https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
        """

        if compression == 'infer':
            # Infer the compression method from the file extension
            if self.relative_path.endswith('.gz'):
                compression = dict(method='gzip')
            elif self.relative_path.endswith('.bz2'):
                compression = dict(method='bz2')
            elif self.relative_path.endswith('.xz'):
                compression = dict(method='xz')
            elif self.relative_path.endswith('.zst'):
                compression = dict(method='zstd')
            else:
                compression = None

        # Compressed files are passed to pandas as raw bytes;
        # uncompressed files are decoded to text first
        if compression is not None:
            handle = BytesIO(self._get())
        else:
            handle = StringIO(self._get().decode(encoding))

        df = pd.read_csv(
            handle,
            compression=compression,
            encoding=encoding,
            **kwargs
        )
        handle.close()
        return df

    def read_h5ad(self) -> 'anndata.AnnData':
        """Read an AnnData object from a file."""
        # Import the anndata library, and raise an error if it is not available
        try:
            import anndata as ad  # noqa
        except ImportError:
            raise ImportError("The anndata library is required to read AnnData files. "
                              "Please install it using 'pip install anndata'.")

        # Download the file to an in-memory handle and parse the contents
        with BytesIO(self._get()) as handle:
            return ad.read_h5ad(handle)

    def readlines(self, encoding='utf-8', compression=None) -> List[str]:
        """Read the file contents as a list of lines."""

        return self.read(
            encoding=encoding,
            compression=compression
        ).splitlines()

    def read(self, encoding='utf-8', compression=None) -> str:
        """Read the file contents as text."""

        # Get the raw file contents
        cont = self._get()

        # If the file is uncompressed, decode the bytes directly
        if compression is None:
            return cont.decode(encoding)

        # Only gzip compression is supported currently
        if compression != "gzip":
            raise DataPortalInputError("compression may be 'gzip' or None")

        # Decompress and decode the gzip-compressed contents
        with gzip.open(BytesIO(cont), 'rt', encoding=encoding) as handle:
            return handle.read()

    def download(self, download_location: str = None):
        """Download the file to a local directory."""

        if download_location is None:
            raise DataPortalInputError("Must provide download location")

        self._client.file.download_files(
            self._file.access_context,
            download_location,
            [self.relative_path]
        )


class DataPortalFiles(DataPortalAssets[DataPortalFile]):
    """Collection of DataPortalFile objects."""
    asset_name = "file"

    def download(self, download_location: str = None) -> None:
        """Download the collection of files to a local directory."""

        for f in self:
            f.download(download_location)

class DataPortalFile(cirro.sdk.asset.DataPortalAsset):

Datasets are made up of a collection of File objects in the Data Portal.

DataPortalFile(file: cirro.models.file.File, client: cirro.CirroApi)

Instantiate by listing files from a dataset.

from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
    project="id-or-name-of-project",
    dataset="id-or-name-of-dataset"
)
files = dataset.list_files()
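
Once the collection has been returned, the files in it can be iterated over and filtered on their attributes. A minimal sketch (the relative path used for filtering is hypothetical):

```python
# Print a summary of each file (relative path and human-readable size)
for file in files:
    print(str(file))

# Select a single file by its relative path within the dataset
# ("data/results.csv" is a hypothetical example path)
selected = [f for f in files if f.relative_path == "data/results.csv"][0]
```
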
id: str

Relative path of file within the dataset

name: str

Relative path of file within the dataset

file_name: str

Name of file, excluding the full folder path within the dataset

relative_path: str

Relative path of file within the dataset

absolute_path: str

Full URI to the file object in AWS S3

metadata: dict

File metadata

size_bytes: int

File size (in bytes)

size: str

File size converted to a human-readable string (e.g., 4.50 GB)

def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> pandas.core.frame.DataFrame:

Parse the file as a Pandas DataFrame.

The default field separator is a comma (CSV); use sep='\t' for TSV.

File compression is inferred from the file extension, but can be set explicitly with the compression= argument.

All other keyword arguments are passed to pandas.read_csv: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
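
As a usage sketch (the file paths are hypothetical, and files is the collection returned by dataset.list_files()):

```python
# Read a comma-separated file into a pandas DataFrame
csv_file = [f for f in files if f.relative_path == "data/results.csv"][0]
df = csv_file.read_csv()

# Tab-separated files use the same method; sep is passed through to pandas.read_csv
tsv_file = [f for f in files if f.relative_path == "data/results.tsv"][0]
tsv_df = tsv_file.read_csv(sep="\t")

# Compression is inferred from extensions like .gz, .bz2, .xz, and .zst,
# but it can also be set explicitly
gz_file = [f for f in files if f.relative_path == "data/results.csv.gz"][0]
gz_df = gz_file.read_csv(compression="gzip")
```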

def read_h5ad(self) -> 'anndata.AnnData':

Read an AnnData object from a file.
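
A minimal sketch of loading a single-cell object into memory (the .h5ad file selection is hypothetical, and the anndata package must be installed):

```python
# Select an AnnData (.h5ad) file from the dataset and parse it in memory
h5ad_file = [f for f in files if f.relative_path.endswith(".h5ad")][0]
adata = h5ad_file.read_h5ad()
print(adata.shape)  # (n_obs, n_vars)
```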

def readlines(self, encoding='utf-8', compression=None) -> List[str]:

Read the file contents as a list of lines.
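
For example (the file path is hypothetical), a small plain-text file can be read and split into a list of lines:

```python
# Read a small text file as a list of lines (trailing newlines are stripped)
log_file = [f for f in files if f.relative_path == "logs/run.log"][0]
for line in log_file.readlines():
    print(line)
```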

def read(self, encoding='utf-8', compression=None) -> str:

Read the file contents as text.
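
A sketch of reading text contents directly into memory (the paths are hypothetical); note that only gzip is accepted for the compression argument:

```python
# Read an uncompressed text file
readme = [f for f in files if f.relative_path == "README.txt"][0]
text = readme.read()

# Read a gzip-compressed text file by setting compression explicitly
report = [f for f in files if f.relative_path == "data/report.txt.gz"][0]
report_text = report.read(compression="gzip")
```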

def download(self, download_location: str = None):

Download the file to a local directory.
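
For example (the relative path and local directory are hypothetical), a file can be written to disk rather than read into memory:

```python
# Select a file from the dataset and download it to a local directory
selected = [f for f in files if f.relative_path == "data/results.csv"][0]
selected.download("/tmp/cirro_downloads")
```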

class DataPortalFiles(cirro.sdk.asset.DataPortalAssets[DataPortalFile]):

Collection of DataPortalFile objects.

asset_name = 'file'
def download(self, download_location: str = None) -> None:

Download the collection of files to a local directory.
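
A sketch of downloading every file in a dataset at once (the local directory is hypothetical):

```python
# Download all files in the dataset to a local directory
files = dataset.list_files()
files.download("/tmp/cirro_downloads")
```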