cirro.sdk.file
import gzip
from io import BytesIO, StringIO
from typing import List

import pandas as pd

from cirro.cirro_client import CirroApi
from cirro.models.file import File
from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
from cirro.sdk.exceptions import DataPortalInputError
from cirro.utils import convert_size


class DataPortalFile(DataPortalAsset):
    """
    Datasets are made up of a collection of File objects in the Data Portal.
    """

    def __init__(self, file: File, client: CirroApi):
        """
        Instantiate by listing files from a dataset.

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        files = dataset.list_files()
        ```
        """
        # Attach the file object and the API client used for all I/O
        self._file = file
        self._client = client

    # Note that the 'name' and 'id' attributes are set to the relative path.
    # The purpose of this is to support the DataPortalAssets class functions.
    @property
    def id(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def name(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def file_name(self) -> str:
        """Name of file, excluding the full folder path within the dataset"""
        return self._file.name

    @property
    def relative_path(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def absolute_path(self) -> str:
        """Full URI to the file object in AWS S3"""
        return self._file.absolute_path

    @property
    def metadata(self) -> dict:
        """File metadata"""
        return self._file.metadata

    @property
    def size_bytes(self) -> int:
        """File size (in bytes)"""
        return self._file.size

    @property
    def size(self) -> str:
        """File size converted to human-readable (e.g., 4.50 GB)"""
        return convert_size(self._file.size)

    def __str__(self):
        return f"{self.relative_path} ({self.size})"

    def _get(self) -> bytes:
        """Internal method to call client.file.get_file"""
        return self._client.file.get_file(self._file)

    def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> pd.DataFrame:
        """
        Parse the file as a Pandas DataFrame.

        The default field separator is a comma (for CSV), use sep='\\t' for TSV.

        File compression is inferred from the extension, but can be set
        explicitly with the compression= flag.

        All other keyword arguments are passed to pandas.read_csv
        https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
        """

        if compression == 'infer':
            # Map the file extension to the pandas compression method.
            # BUGFIX: '.xz' previously selected 'zstd'; it now selects 'xz'.
            compression = None
            for suffix, method in (
                ('.gz', 'gzip'),
                ('.bz2', 'bz2'),
                ('.xz', 'xz'),
                ('.zst', 'zstd'),
            ):
                if self.relative_path.endswith(suffix):
                    compression = dict(method=method)
                    break

        # Compressed content must be passed to pandas as raw bytes;
        # uncompressed content can be decoded to text up front
        if compression is not None:
            handle = BytesIO(self._get())
        else:
            handle = StringIO(self._get().decode(encoding))

        # BytesIO/StringIO are context managers -- close the handle
        # even if pandas raises a parsing error
        with handle:
            return pd.read_csv(
                handle,
                compression=compression,
                encoding=encoding,
                **kwargs
            )

    def readlines(self, encoding='utf-8', compression=None) -> List[str]:
        """Read the file contents as a list of lines."""

        return self.read(
            encoding=encoding,
            compression=compression
        ).splitlines()

    def read(self, encoding='utf-8', compression=None) -> str:
        """
        Read the file contents as text.

        Pass compression='gzip' for gzip-compressed files, or
        None (the default) for plain text; other values raise
        DataPortalInputError.
        """

        # Get the raw file contents
        cont = self._get()

        # If the file is uncompressed, decode directly
        if compression is None:
            return cont.decode(encoding)

        # Only gzip-compression is supported currently
        if compression != "gzip":
            raise DataPortalInputError("compression may be 'gzip' or None")

        with gzip.open(BytesIO(cont), 'rt', encoding=encoding) as handle:
            return handle.read()

    def download(self, download_location: str = None):
        """Download the file to a local directory."""

        if download_location is None:
            raise DataPortalInputError("Must provide download location")

        self._client.file.download_files(
            self._file.access_context,
            download_location,
            [self.relative_path]
        )


class DataPortalFiles(DataPortalAssets[DataPortalFile]):
    """Collection of DataPortalFile objects."""
    asset_name = "file"

    def download(self, download_location: str = None) -> None:
        """Download the collection of files to a local directory."""

        for f in self:
            f.download(download_location)
class DataPortalFile(DataPortalAsset):
    """
    Datasets are made up of a collection of File objects in the Data Portal.
    """

    def __init__(self, file: File, client: CirroApi):
        """
        Instantiate by listing files from a dataset.

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        files = dataset.list_files()
        ```
        """
        # Attach the file object and the API client used for all I/O
        self._file = file
        self._client = client

    # Note that the 'name' and 'id' attributes are set to the relative path.
    # The purpose of this is to support the DataPortalAssets class functions.
    @property
    def id(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def name(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def file_name(self) -> str:
        """Name of file, excluding the full folder path within the dataset"""
        return self._file.name

    @property
    def relative_path(self) -> str:
        """Relative path of file within the dataset"""
        return self._file.relative_path

    @property
    def absolute_path(self) -> str:
        """Full URI to the file object in AWS S3"""
        return self._file.absolute_path

    @property
    def metadata(self) -> dict:
        """File metadata"""
        return self._file.metadata

    @property
    def size_bytes(self) -> int:
        """File size (in bytes)"""
        return self._file.size

    @property
    def size(self) -> str:
        """File size converted to human-readable (e.g., 4.50 GB)"""
        return convert_size(self._file.size)

    def __str__(self):
        return f"{self.relative_path} ({self.size})"

    def _get(self) -> bytes:
        """Internal method to call client.file.get_file"""
        return self._client.file.get_file(self._file)

    def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> pd.DataFrame:
        """
        Parse the file as a Pandas DataFrame.

        The default field separator is a comma (for CSV), use sep='\\t' for TSV.

        File compression is inferred from the extension, but can be set
        explicitly with the compression= flag.

        All other keyword arguments are passed to pandas.read_csv
        https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
        """

        if compression == 'infer':
            # Map the file extension to the pandas compression method.
            # BUGFIX: '.xz' previously selected 'zstd'; it now selects 'xz'.
            compression = None
            for suffix, method in (
                ('.gz', 'gzip'),
                ('.bz2', 'bz2'),
                ('.xz', 'xz'),
                ('.zst', 'zstd'),
            ):
                if self.relative_path.endswith(suffix):
                    compression = dict(method=method)
                    break

        # Compressed content must be passed to pandas as raw bytes;
        # uncompressed content can be decoded to text up front
        if compression is not None:
            handle = BytesIO(self._get())
        else:
            handle = StringIO(self._get().decode(encoding))

        # BytesIO/StringIO are context managers -- close the handle
        # even if pandas raises a parsing error
        with handle:
            return pd.read_csv(
                handle,
                compression=compression,
                encoding=encoding,
                **kwargs
            )

    def readlines(self, encoding='utf-8', compression=None) -> List[str]:
        """Read the file contents as a list of lines."""

        return self.read(
            encoding=encoding,
            compression=compression
        ).splitlines()

    def read(self, encoding='utf-8', compression=None) -> str:
        """
        Read the file contents as text.

        Pass compression='gzip' for gzip-compressed files, or
        None (the default) for plain text; other values raise
        DataPortalInputError.
        """

        # Get the raw file contents
        cont = self._get()

        # If the file is uncompressed, decode directly
        if compression is None:
            return cont.decode(encoding)

        # Only gzip-compression is supported currently
        if compression != "gzip":
            raise DataPortalInputError("compression may be 'gzip' or None")

        with gzip.open(BytesIO(cont), 'rt', encoding=encoding) as handle:
            return handle.read()

    def download(self, download_location: str = None):
        """Download the file to a local directory."""

        if download_location is None:
            raise DataPortalInputError("Must provide download location")

        self._client.file.download_files(
            self._file.access_context,
            download_location,
            [self.relative_path]
        )
Datasets are made up of a collection of File objects in the Data Portal.
20 def __init__(self, file: File, client: CirroApi): 21 """ 22 Instantiate by listing files from a dataset. 23 24 ```python 25 from cirro import DataPortal() 26 portal = DataPortal() 27 dataset = portal.get_dataset( 28 project="id-or-name-of-project", 29 dataset="id-or-name-of-dataset" 30 ) 31 files = dataset.list_files() 32 ``` 33 """ 34 # Attach the file object 35 self._file = file 36 self._client = client
Instantiate by listing files from a dataset.
from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
project="id-or-name-of-project",
dataset="id-or-name-of-dataset"
)
files = dataset.list_files()
40 @property 41 def id(self) -> str: 42 """Relative path of file within the dataset""" 43 return self._file.relative_path
Relative path of file within the dataset
45 @property 46 def name(self) -> str: 47 """Relative path of file within the dataset""" 48 return self._file.relative_path
Relative path of file within the dataset
50 @property 51 def file_name(self) -> str: 52 """Name of file, excluding the full folder path within the dataset""" 53 return self._file.name
Name of file, excluding the full folder path within the dataset
55 @property 56 def relative_path(self) -> str: 57 """Relative path of file within the dataset""" 58 return self._file.relative_path
Relative path of file within the dataset
60 @property 61 def absolute_path(self) -> str: 62 """Fully URI to file object in AWS S3""" 63 return self._file.absolute_path
Full URI to the file object in AWS S3
70 @property 71 def size_bytes(self) -> int: 72 """File size (in bytes)""" 73 return self._file.size
File size (in bytes)
75 @property 76 def size(self) -> str: 77 """File size converted to human-readable (e.g., 4.50 GB)""" 78 return convert_size(self._file.size)
File size converted to human-readable (e.g., 4.50 GB)
88 def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> pd.DataFrame: 89 """ 90 Parse the file as a Pandas DataFrame. 91 92 The default field separator is a comma (for CSV), use sep='\\t' for TSV. 93 94 File compression is inferred from the extension, but can be set 95 explicitly with the compression= flag. 96 97 All other keyword arguments are passed to pandas.read_csv 98 https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html 99 """ 100 101 if compression == 'infer': 102 # If the file appears to be compressed 103 if self.relative_path.endswith('.gz'): 104 compression = dict(method='gzip') 105 elif self.relative_path.endswith('.bz2'): 106 compression = dict(method='bz2') 107 elif self.relative_path.endswith('.xz'): 108 compression = dict(method='zstd') 109 elif self.relative_path.endswith('.zst'): 110 compression = dict(method='zstd') 111 else: 112 compression = None 113 114 if compression is not None: 115 handle = BytesIO(self._get()) 116 else: 117 handle = StringIO(self._get().decode(encoding)) 118 119 df = pd.read_csv( 120 handle, 121 compression=compression, 122 encoding=encoding, 123 **kwargs 124 ) 125 handle.close() 126 return df
Parse the file as a Pandas DataFrame.
The default field separator is a comma (for CSV), use sep='\t' for TSV.
File compression is inferred from the extension, but can be set explicitly with the compression= flag.
All other keyword arguments are passed to pandas.read_csv https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
128 def readlines(self, encoding='utf-8', compression=None) -> List[str]: 129 """Read the file contents as a list of lines.""" 130 131 return self.read( 132 encoding=encoding, 133 compression=compression 134 ).splitlines()
Read the file contents as a list of lines.
136 def read(self, encoding='utf-8', compression=None) -> str: 137 """Read the file contents as text.""" 138 139 # Get the raw file contents 140 cont = self._get() 141 142 # If the file is uncompressed 143 if compression is None: 144 return cont.decode(encoding) 145 # If the file is compressed 146 else: 147 148 # Only gzip-compression is supported currently 149 if compression != "gzip": 150 raise DataPortalInputError("compression may be 'gzip' or None") 151 152 with gzip.open( 153 BytesIO( 154 cont 155 ), 156 'rt', 157 encoding=encoding 158 ) as handle: 159 return handle.read()
Read the file contents as text.
161 def download(self, download_location: str = None): 162 """Download the file to a local directory.""" 163 164 if download_location is None: 165 raise DataPortalInputError("Must provide download location") 166 167 self._client.file.download_files( 168 self._file.access_context, 169 download_location, 170 [self.relative_path] 171 )
Download the file to a local directory.
174class DataPortalFiles(DataPortalAssets[DataPortalFile]): 175 """Collection of DataPortalFile objects.""" 176 asset_name = "file" 177 178 def download(self, download_location: str = None) -> None: 179 """Download the collection of files to a local directory.""" 180 181 for f in self: 182 f.download(download_location)
Collection of DataPortalFile objects.
178 def download(self, download_location: str = None) -> None: 179 """Download the collection of files to a local directory.""" 180 181 for f in self: 182 f.download(download_location)
Download the collection of files to a local directory.
Inherited Members
- cirro.sdk.asset.DataPortalAssets
- DataPortalAssets
- description
- get_by_name
- get_by_id
- filter_by_pattern
- builtins.list
- clear
- copy
- append
- insert
- extend
- pop
- remove
- index
- count
- reverse
- sort