cirro.sdk.file

import gzip
from io import BytesIO, StringIO
from typing import List

import pandas as pd

from typing import TYPE_CHECKING
if TYPE_CHECKING:
    import anndata

from cirro.cirro_client import CirroApi
from cirro.models.file import File
from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
from cirro.sdk.exceptions import DataPortalInputError
from cirro.utils import convert_size


class DataPortalFile(DataPortalAsset):
    """
    Datasets are made up of a collection of File objects in the Data Portal.
    """

    def __init__(self, file: File, client: CirroApi):
        """
        Instantiate by listing files from a dataset.

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        files = dataset.list_files()
        ```
        """
        # Attach the file object and the API client
        self._file = file
        self._client = client

    # Note that the 'name' and 'id' attributes are set to the relative path
    # The purpose of this is to support the DataPortalAssets class functions
    @property
    def id(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def name(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def file_name(self) -> str:
        """Name of file, excluding the full folder path within the dataset"""
        return self._file.name

    @property
    def relative_path(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def absolute_path(self) -> str:
        """Full URI to the file object in AWS S3"""
        return self._file.absolute_path

    @property
    def metadata(self) -> dict:
        """File metadata"""
        return self._file.metadata

    @property
    def size_bytes(self) -> int:
        """File size (in bytes)"""
        return self._file.size

    @property
    def size(self) -> str:
        """File size converted to a human-readable string (e.g., 4.50 GB)"""
        return convert_size(self._file.size)

    def __str__(self):
        return f"{self.relative_path} ({self.size})"

    def _get(self) -> bytes:
        """Internal method to call client.file.get_file"""

        return self._client.file.get_file(self._file)

    def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> pd.DataFrame:
        """
        Parse the file as a Pandas DataFrame.

        The default field separator is a comma (CSV); use sep='\\t' for TSV.

        File compression is inferred from the file extension, but can be set
        explicitly with the compression= argument.

        All other keyword arguments are passed to pandas.read_csv:
        https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
        """

        if compression == 'infer':
            # Infer the compression method from the file extension
            if self.relative_path.endswith('.gz'):
                compression = dict(method='gzip')
            elif self.relative_path.endswith('.bz2'):
                compression = dict(method='bz2')
            elif self.relative_path.endswith('.xz'):
                compression = dict(method='xz')
            elif self.relative_path.endswith('.zst'):
                compression = dict(method='zstd')
            else:
                compression = None

        # Compressed files are passed to pandas as raw bytes;
        # uncompressed files are decoded to text first
        if compression is not None:
            handle = BytesIO(self._get())
        else:
            handle = StringIO(self._get().decode(encoding))

        df = pd.read_csv(
            handle,
            compression=compression,
            encoding=encoding,
            **kwargs
        )
        handle.close()
        return df

    def read_h5ad(self) -> 'anndata.AnnData':
        """Read an AnnData object from a file."""
        # Import the anndata library, and raise an error if it is not available
        try:
            import anndata as ad  # noqa
        except ImportError:
            raise ImportError("The anndata library is required to read AnnData files. "
                              "Please install it using 'pip install anndata'.")

        # Download the file to an in-memory handle and parse the contents
        with BytesIO(self._get()) as handle:
            return ad.read_h5ad(handle)

    def readlines(self, encoding='utf-8', compression=None) -> List[str]:
        """Read the file contents as a list of lines."""

        return self.read(
            encoding=encoding,
            compression=compression
        ).splitlines()

    def read(self, encoding='utf-8', compression=None) -> str:
        """Read the file contents as text."""

        # Get the raw file contents
        cont = self._get()

        # If the file is uncompressed, decode the bytes directly
        if compression is None:
            return cont.decode(encoding)

        # Only gzip compression is supported currently
        if compression != "gzip":
            raise DataPortalInputError("compression may be 'gzip' or None")

        # Decompress and decode the gzip-compressed contents
        with gzip.open(BytesIO(cont), 'rt', encoding=encoding) as handle:
            return handle.read()

    def download(self, download_location: str = None):
        """Download the file to a local directory."""

        if download_location is None:
            raise DataPortalInputError("Must provide download location")

        self._client.file.download_files(
            self._file.access_context,
            download_location,
            [self.relative_path]
        )


class DataPortalFiles(DataPortalAssets[DataPortalFile]):
    """Collection of DataPortalFile objects."""
    asset_name = "file"

    def download(self, download_location: str = None) -> None:
        """Download the collection of files to a local directory."""

        for f in self:
            f.download(download_location)

class DataPortalFile(cirro.sdk.asset.DataPortalAsset):

Datasets are made up of a collection of File objects in the Data Portal.

DataPortalFile(file: cirro.models.file.File, client: cirro.CirroApi)

Instantiate by listing files from a dataset.

from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
    project="id-or-name-of-project",
    dataset="id-or-name-of-dataset"
)
files = dataset.list_files()
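
Once the collection has been returned, the files in it can be iterated over and filtered on their attributes. A minimal sketch (the relative path used for filtering is hypothetical):

```python
# Print a summary of each file (relative path and human-readable size)
for file in files:
    print(str(file))

# Select a single file by its relative path within the dataset
# ("data/results.csv" is a hypothetical example path)
selected = [f for f in files if f.relative_path == "data/results.csv"][0]
```
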
id: str

Relative path of file within the dataset

name: str

Relative path of file within the dataset

file_name: str

Name of file, excluding the full folder path within the dataset

relative_path: str

Relative path of file within the dataset

absolute_path: str

Full URI to the file object in AWS S3

metadata: dict

File metadata

size_bytes: int

File size (in bytes)

size: str

File size converted to a human-readable string (e.g., 4.50 GB)

def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> pandas.core.frame.DataFrame:

Parse the file as a Pandas DataFrame.

The default field separator is a comma (CSV); use sep='\t' for TSV.

File compression is inferred from the file extension, but can be set explicitly with the compression= argument.

All other keyword arguments are passed to pandas.read_csv: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
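
As a usage sketch (the file paths are hypothetical, and files is the collection returned by dataset.list_files()):

```python
# Read a comma-separated file into a pandas DataFrame
csv_file = [f for f in files if f.relative_path == "data/results.csv"][0]
df = csv_file.read_csv()

# Tab-separated files use the same method; sep is passed through to pandas.read_csv
tsv_file = [f for f in files if f.relative_path == "data/results.tsv"][0]
tsv_df = tsv_file.read_csv(sep="\t")

# Compression is inferred from extensions like .gz, .bz2, .xz, and .zst,
# but it can also be set explicitly
gz_file = [f for f in files if f.relative_path == "data/results.csv.gz"][0]
gz_df = gz_file.read_csv(compression="gzip")
```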

def read_h5ad(self) -> 'anndata.AnnData':

Read an AnnData object from a file.
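
A minimal sketch of loading a single-cell object into memory (the .h5ad file selection is hypothetical, and the anndata package must be installed):

```python
# Select an AnnData (.h5ad) file from the dataset and parse it in memory
h5ad_file = [f for f in files if f.relative_path.endswith(".h5ad")][0]
adata = h5ad_file.read_h5ad()
print(adata.shape)  # (n_obs, n_vars)
```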

def readlines(self, encoding='utf-8', compression=None) -> List[str]:

Read the file contents as a list of lines.
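
For example (the file path is hypothetical), a small plain-text file can be read and split into a list of lines:

```python
# Read a small text file as a list of lines (trailing newlines are stripped)
log_file = [f for f in files if f.relative_path == "logs/run.log"][0]
for line in log_file.readlines():
    print(line)
```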

def read(self, encoding='utf-8', compression=None) -> str:

Read the file contents as text.
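
A sketch of reading text contents directly into memory (the paths are hypothetical); note that only gzip is accepted for the compression argument:

```python
# Read an uncompressed text file
readme = [f for f in files if f.relative_path == "README.txt"][0]
text = readme.read()

# Read a gzip-compressed text file by setting compression explicitly
report = [f for f in files if f.relative_path == "data/report.txt.gz"][0]
report_text = report.read(compression="gzip")
```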

def download(self, download_location: str = None):

Download the file to a local directory.
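
For example (the relative path and local directory are hypothetical), a file can be written to disk rather than read into memory:

```python
# Select a file from the dataset and download it to a local directory
selected = [f for f in files if f.relative_path == "data/results.csv"][0]
selected.download("/tmp/cirro_downloads")
```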

class DataPortalFiles(cirro.sdk.asset.DataPortalAssets[DataPortalFile]):

Collection of DataPortalFile objects.

asset_name = 'file'
def download(self, download_location: str = None) -> None:

Download the collection of files to a local directory.
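
A sketch of downloading every file in a dataset at once (the local directory is hypothetical):

```python
# Download all files in the dataset to a local directory
files = dataset.list_files()
files.download("/tmp/cirro_downloads")
```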