cirro.sdk.dataset
import datetime
from typing import Union, List, Optional

from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, FileEntry, \
    ProcessDetail, Status, DatasetDetailParams, RunAnalysisRequestParams, DatasetDetailInfo, \
    Tag

from cirro.cirro_client import CirroApi
from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
from cirro.sdk.exceptions import DataPortalInputError
from cirro.sdk.file import DataPortalFile, DataPortalFiles
from cirro.sdk.helpers import parse_process_name_or_id
from cirro.sdk.process import DataPortalProcess


class DataPortalDataset(DataPortalAsset):
    """
    Datasets in the Data Portal are collections of files which have
    either been uploaded directly, or which have been output by
    an analysis pipeline or notebook.
    """

    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
        """
        Instantiate a dataset object

        Should be invoked from a top-level constructor, for example:

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        ```

        Args:
            dataset (Dataset or DatasetDetail): API record describing the dataset
            client (CirroApi): Client used for all subsequent API calls

        Raises:
            AssertionError: If the dataset record has no project_id
        """
        # Kept as an assert so that existing callers see the same exception type
        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
        self._data = dataset
        # Lazily populated by list_files(); None means "not fetched yet"
        self._files: Optional[DataPortalFiles] = None
        self._client = client

    @property
    def id(self) -> str:
        """Unique identifier for the dataset"""
        return self._data.id

    @property
    def name(self) -> str:
        """Editable name for the dataset"""
        return self._data.name

    @property
    def description(self) -> str:
        """Longer name for the dataset"""
        return self._data.description

    @property
    def process_id(self) -> str:
        """Unique ID of process used to create the dataset"""
        return self._data.process_id

    @property
    def process(self) -> ProcessDetail:
        """
        Object representing the process used to create the dataset
        (looked up through the client on every access)
        """
        return self._client.processes.get(self.process_id)

    @property
    def project_id(self) -> str:
        """ID of the project containing the dataset"""
        return self._data.project_id

    @property
    def status(self) -> Status:
        """
        Status of the dataset
        """
        return self._data.status

    @property
    def source_dataset_ids(self) -> List[str]:
        """IDs of the datasets used as sources for this dataset (if any)"""
        return self._data.source_dataset_ids

    @property
    def source_datasets(self) -> List['DataPortalDataset']:
        """
        Objects representing the datasets used as sources for this dataset (if any)

        Each source dataset is fetched individually from the same project.
        """
        return [
            DataPortalDataset(
                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
                client=self._client
            )
            for dataset_id in self.source_dataset_ids
        ]

    @property
    def params(self) -> DatasetDetailParams:
        """
        Parameters used to generate the dataset
        """
        return self._get_detail().params

    @property
    def info(self) -> DatasetDetailInfo:
        """
        Detailed information about the dataset
        """
        return self._get_detail().info

    @property
    def tags(self) -> List[Tag]:
        """
        Tags applied to the dataset
        """
        return self._data.tags

    @property
    def created_by(self) -> str:
        """User who created the dataset"""
        return self._data.created_by

    @property
    def created_at(self) -> datetime.datetime:
        """Timestamp of dataset creation"""
        return self._data.created_at

    def _get_detail(self) -> DatasetDetail:
        """
        Return the detailed dataset record, fetching it if only a summary is held.

        The fetched record replaces the stored one, so later calls reuse it.
        """
        if not isinstance(self._data, DatasetDetail):
            self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id)
        return self._data

    def __str__(self):
        """Return the name, id, description and status, one per line."""
        return '\n'.join([
            f"{i.title()}: {getattr(self, i)}"
            for i in ['name', 'id', 'description', 'status']
        ])

    def list_files(self) -> DataPortalFiles:
        """
        Return the list of files which make up the dataset.

        The listing is fetched on the first call and cached on this object.
        """
        # Test for None rather than truthiness: a cached listing with no files
        # would otherwise look "missing" and be fetched again on every call
        if self._files is None:
            self._files = DataPortalFiles(
                [
                    DataPortalFile(file=file, client=self._client)
                    for file in self._client.datasets.get_file_listing(
                        project_id=self.project_id,
                        dataset_id=self.id
                    )
                ]
            )
        return self._files

    def download_files(self, download_location: Optional[str] = None) -> None:
        """
        Download all the files from the dataset to a local directory.

        Args:
            download_location (str): Path to local directory
        """

        # Alias for internal method
        self.list_files().download(download_location)

    def run_analysis(
            self,
            name: str = None,
            description: str = "",
            process: Union[DataPortalProcess, str] = None,
            params=None,
            notifications_emails=None
    ) -> str:
        """
        Runs an analysis on a dataset, returns the ID of the newly created dataset.

        The process can be provided as either a DataPortalProcess object,
        or a string which corresponds to the name or ID of the process.

        Args:
            name (str): Name of newly created dataset
            description (str): Description of newly created dataset
            process (DataPortalProcess or str): Process to run
            params (dict): Analysis parameters
            notifications_emails (List[str]): Notification email address(es)

        Returns:
            dataset_id (str): ID of newly created dataset

        Raises:
            DataPortalInputError: If `name` or `process` is not provided
        """
        if name is None:
            raise DataPortalInputError("Must specify 'name' for run_analysis")
        if process is None:
            raise DataPortalInputError("Must specify 'process' for run_analysis")
        if notifications_emails is None:
            notifications_emails = []
        if params is None:
            params = {}

        # If the process is a string, try to parse it as a process name or ID
        process = parse_process_name_or_id(process, self._client)

        resp = self._client.execution.run_analysis(
            project_id=self.project_id,
            request=RunAnalysisRequest(
                name=name,
                description=description,
                process_id=process.id,
                source_dataset_ids=[self.id],
                params=RunAnalysisRequestParams.from_dict(params),
                notification_emails=notifications_emails
            )
        )
        return resp.id


class DataPortalDatasets(DataPortalAssets[DataPortalDataset]):
    """Collection of multiple DataPortalDataset objects."""
    asset_name = "dataset"
class DataPortalDataset(DataPortalAsset):
    """
    Datasets in the Data Portal are collections of files which have
    either been uploaded directly, or which have been output by
    an analysis pipeline or notebook.
    """

    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
        """
        Instantiate a dataset object

        Should be invoked from a top-level constructor, for example:

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        ```

        Args:
            dataset (Dataset or DatasetDetail): API record describing the dataset
            client (CirroApi): Client used for all subsequent API calls

        Raises:
            AssertionError: If the dataset record has no project_id
        """
        # Kept as an assert so that existing callers see the same exception type
        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
        self._data = dataset
        # Lazily populated by list_files(); None means "not fetched yet"
        self._files: Optional[DataPortalFiles] = None
        self._client = client

    @property
    def id(self) -> str:
        """Unique identifier for the dataset"""
        return self._data.id

    @property
    def name(self) -> str:
        """Editable name for the dataset"""
        return self._data.name

    @property
    def description(self) -> str:
        """Longer name for the dataset"""
        return self._data.description

    @property
    def process_id(self) -> str:
        """Unique ID of process used to create the dataset"""
        return self._data.process_id

    @property
    def process(self) -> ProcessDetail:
        """
        Object representing the process used to create the dataset
        (looked up through the client on every access)
        """
        return self._client.processes.get(self.process_id)

    @property
    def project_id(self) -> str:
        """ID of the project containing the dataset"""
        return self._data.project_id

    @property
    def status(self) -> Status:
        """
        Status of the dataset
        """
        return self._data.status

    @property
    def source_dataset_ids(self) -> List[str]:
        """IDs of the datasets used as sources for this dataset (if any)"""
        return self._data.source_dataset_ids

    @property
    def source_datasets(self) -> List['DataPortalDataset']:
        """
        Objects representing the datasets used as sources for this dataset (if any)

        Each source dataset is fetched individually from the same project.
        """
        return [
            DataPortalDataset(
                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
                client=self._client
            )
            for dataset_id in self.source_dataset_ids
        ]

    @property
    def params(self) -> DatasetDetailParams:
        """
        Parameters used to generate the dataset
        """
        return self._get_detail().params

    @property
    def info(self) -> DatasetDetailInfo:
        """
        Detailed information about the dataset
        """
        return self._get_detail().info

    @property
    def tags(self) -> List[Tag]:
        """
        Tags applied to the dataset
        """
        return self._data.tags

    @property
    def created_by(self) -> str:
        """User who created the dataset"""
        return self._data.created_by

    @property
    def created_at(self) -> datetime.datetime:
        """Timestamp of dataset creation"""
        return self._data.created_at

    def _get_detail(self) -> DatasetDetail:
        """
        Return the detailed dataset record, fetching it if only a summary is held.

        The fetched record replaces the stored one, so later calls reuse it.
        """
        if not isinstance(self._data, DatasetDetail):
            self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id)
        return self._data

    def __str__(self):
        """Return the name, id, description and status, one per line."""
        return '\n'.join([
            f"{i.title()}: {getattr(self, i)}"
            for i in ['name', 'id', 'description', 'status']
        ])

    def list_files(self) -> DataPortalFiles:
        """
        Return the list of files which make up the dataset.

        The listing is fetched on the first call and cached on this object.
        """
        # Test for None rather than truthiness: a cached listing with no files
        # would otherwise look "missing" and be fetched again on every call
        if self._files is None:
            self._files = DataPortalFiles(
                [
                    DataPortalFile(file=file, client=self._client)
                    for file in self._client.datasets.get_file_listing(
                        project_id=self.project_id,
                        dataset_id=self.id
                    )
                ]
            )
        return self._files

    def download_files(self, download_location: Optional[str] = None) -> None:
        """
        Download all the files from the dataset to a local directory.

        Args:
            download_location (str): Path to local directory
        """

        # Alias for internal method
        self.list_files().download(download_location)

    def run_analysis(
            self,
            name: str = None,
            description: str = "",
            process: Union[DataPortalProcess, str] = None,
            params=None,
            notifications_emails=None
    ) -> str:
        """
        Runs an analysis on a dataset, returns the ID of the newly created dataset.

        The process can be provided as either a DataPortalProcess object,
        or a string which corresponds to the name or ID of the process.

        Args:
            name (str): Name of newly created dataset
            description (str): Description of newly created dataset
            process (DataPortalProcess or str): Process to run
            params (dict): Analysis parameters
            notifications_emails (List[str]): Notification email address(es)

        Returns:
            dataset_id (str): ID of newly created dataset

        Raises:
            DataPortalInputError: If `name` or `process` is not provided
        """
        if name is None:
            raise DataPortalInputError("Must specify 'name' for run_analysis")
        if process is None:
            raise DataPortalInputError("Must specify 'process' for run_analysis")
        if notifications_emails is None:
            notifications_emails = []
        if params is None:
            params = {}

        # If the process is a string, try to parse it as a process name or ID
        process = parse_process_name_or_id(process, self._client)

        resp = self._client.execution.run_analysis(
            project_id=self.project_id,
            request=RunAnalysisRequest(
                name=name,
                description=description,
                process_id=process.id,
                source_dataset_ids=[self.id],
                params=RunAnalysisRequestParams.from_dict(params),
                notification_emails=notifications_emails
            )
        )
        return resp.id
Datasets in the Data Portal are collections of files which have either been uploaded directly, or which have been output by an analysis pipeline or notebook.
def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
    """
    Instantiate a dataset object

    Should be invoked from a top-level constructor, for example:

    ```python
    from cirro import DataPortal
    portal = DataPortal()
    dataset = portal.get_dataset(
        project="id-or-name-of-project",
        dataset="id-or-name-of-dataset"
    )
    ```

    Args:
        dataset (Dataset or DatasetDetail): API record describing the dataset
        client (CirroApi): Client used for all subsequent API calls

    Raises:
        AssertionError: If the dataset record has no project_id
    """
    # Kept as an assert so that existing callers see the same exception type
    assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
    self._data = dataset
    # Lazily populated by list_files(); None means "not fetched yet"
    self._files: Optional[DataPortalFiles] = None
    self._client = client
Instantiate a dataset object
Should be invoked from a top-level constructor, for example:
from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
project="id-or-name-of-project",
dataset="id-or-name-of-dataset"
)
@property
def id(self) -> str:
    """Identifier which uniquely distinguishes this dataset."""
    record = self._data
    return record.id
Unique identifier for the dataset
@property
def name(self) -> str:
    """Editable, human-readable name of the dataset."""
    record = self._data
    return record.name
Editable name for the dataset
@property
def description(self) -> str:
    """Free-text description stored on the dataset record."""
    record = self._data
    return record.description
Longer name for the dataset
@property
def process_id(self) -> str:
    """ID of the process which produced this dataset."""
    record = self._data
    return record.process_id
Unique ID of process used to create the dataset
@property
def process(self) -> ProcessDetail:
    """
    Process record for the process used to create the dataset.

    Looked up through the client's process service on every access,
    using this dataset's `process_id`.
    """
    process_service = self._client.processes
    return process_service.get(self.process_id)
Object representing the process used to create the dataset
@property
def project_id(self) -> str:
    """ID of the project which this dataset belongs to."""
    record = self._data
    return record.project_id
ID of the project containing the dataset
@property
def status(self) -> Status:
    """
    Current status recorded for the dataset
    """
    record = self._data
    return record.status
Status of the dataset
@property
def source_dataset_ids(self) -> List[str]:
    """IDs of any datasets which were used as inputs when creating this dataset."""
    record = self._data
    return record.source_dataset_ids
IDs of the datasets used as sources for this dataset (if any)
@property
def source_datasets(self) -> List['DataPortalDataset']:
    """
    Dataset objects for each of the source datasets (if any).

    Every ID in `source_dataset_ids` is fetched from this dataset's project
    and wrapped in a DataPortalDataset which shares this object's client.
    """
    sources = []
    for source_id in self.source_dataset_ids:
        record = self._client.datasets.get(project_id=self.project_id, dataset_id=source_id)
        sources.append(DataPortalDataset(dataset=record, client=self._client))
    return sources
Objects representing the datasets used as sources for this dataset (if any)
@property
def params(self) -> DatasetDetailParams:
    """
    Parameters which were used to generate the dataset
    (read from the detailed dataset record)
    """
    detail = self._get_detail()
    return detail.params
Parameters used to generate the dataset
@property
def info(self) -> DatasetDetailInfo:
    """
    Detailed information about the dataset
    (read from the detailed dataset record)
    """
    detail = self._get_detail()
    return detail.info
Detailed information about the dataset
@property
def created_by(self) -> str:
    """User recorded as the creator of the dataset."""
    record = self._data
    return record.created_by
User who created the dataset
@property
def created_at(self) -> datetime.datetime:
    """Time at which the dataset was created."""
    record = self._data
    return record.created_at
Timestamp of dataset creation
def list_files(self) -> DataPortalFiles:
    """
    Return the list of files which make up the dataset.

    The listing is fetched on the first call and cached on this object,
    so repeated calls return the same DataPortalFiles collection.
    """
    # Test for None rather than truthiness: a cached listing with no files
    # would otherwise look "missing" and be fetched again on every call
    if self._files is None:
        self._files = DataPortalFiles(
            [
                DataPortalFile(file=file, client=self._client)
                for file in self._client.datasets.get_file_listing(
                    project_id=self.project_id,
                    dataset_id=self.id
                )
            ]
        )
    return self._files
Return the list of files which make up the dataset.
def download_files(self, download_location: str = None) -> None:
    """
    Download every file in the dataset to a local directory.

    Args:
        download_location (str): Path to local directory
    """
    # Delegate to the file collection, which performs the download
    files = self.list_files()
    files.download(download_location)
Download all the files from the dataset to a local directory.
Arguments:
- download_location (str): Path to local directory
def run_analysis(
        self,
        name: str = None,
        description: str = "",
        process: Union[DataPortalProcess, str] = None,
        params=None,
        notifications_emails=None
) -> str:
    """
    Launch an analysis using this dataset as its input and return
    the ID of the dataset which the analysis creates.

    The process may be given as a DataPortalProcess object, or as a
    string holding the name or ID of the process.

    Args:
        name (str): Name of newly created dataset
        description (str): Description of newly created dataset
        process (DataPortalProcess or str): Process to run
        params (dict): Analysis parameters
        notifications_emails (List[str]): Notification email address(es)

    Returns:
        dataset_id (str): ID of newly created dataset

    Raises:
        DataPortalInputError: If `name` or `process` is not provided
    """
    # Both of these are required even though the signature gives them defaults
    if name is None:
        raise DataPortalInputError("Must specify 'name' for run_analysis")
    if process is None:
        raise DataPortalInputError("Must specify 'process' for run_analysis")

    emails = [] if notifications_emails is None else notifications_emails
    analysis_params = {} if params is None else params

    # Resolve a process name or ID to a process object
    resolved_process = parse_process_name_or_id(process, self._client)

    response = self._client.execution.run_analysis(
        project_id=self.project_id,
        request=RunAnalysisRequest(
            name=name,
            description=description,
            process_id=resolved_process.id,
            source_dataset_ids=[self.id],
            params=RunAnalysisRequestParams.from_dict(analysis_params),
            notification_emails=emails
        )
    )
    return response.id
Runs an analysis on a dataset, returns the ID of the newly created dataset.
The process can be provided as either a DataPortalProcess object, or a string which corresponds to the name or ID of the process.
Arguments:
- name (str): Name of newly created dataset
- description (str): Description of newly created dataset
- process (DataPortalProcess or str): Process to run
- params (dict): Analysis parameters
- notifications_emails (List[str]): Notification email address(es)
Returns:
dataset_id (str): ID of newly created dataset
class DataPortalDatasets(DataPortalAssets[DataPortalDataset]):
    """A collection holding multiple DataPortalDataset objects."""

    # Label for the kind of asset held in this collection
    asset_name = "dataset"
Collection of multiple DataPortalDataset objects.
Inherited Members
- cirro.sdk.asset.DataPortalAssets
- DataPortalAssets
- description
- get_by_name
- get_by_id
- filter_by_pattern
- builtins.list
- clear
- copy
- append
- insert
- extend
- pop
- remove
- index
- count
- reverse
- sort