cirro.sdk.dataset
import datetime
from typing import Union, List, Optional

from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \
    RunAnalysisRequestParams, Tag, ArtifactType, NamedItem

from cirro.cirro_client import CirroApi
from cirro.models.assets import DatasetAssets
from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
from cirro.sdk.exceptions import DataPortalAssetNotFound
from cirro.sdk.exceptions import DataPortalInputError
from cirro.sdk.file import DataPortalFile, DataPortalFiles
from cirro.sdk.helpers import parse_process_name_or_id
from cirro.sdk.process import DataPortalProcess


class DataPortalDataset(DataPortalAsset):
    """
    Datasets in the Data Portal are collections of files which have
    either been uploaded directly, or which have been output by
    an analysis pipeline or notebook.
    """

    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
        """
        Instantiate a dataset object

        Should be invoked from a top-level constructor, for example:

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        ```

        """
        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
        self._data = dataset
        # Asset listing is fetched lazily on first use (see _get_assets)
        self._assets: Optional[DatasetAssets] = None
        self._client = client

    @property
    def id(self) -> str:
        """Unique identifier for the dataset"""
        return self._data.id

    @property
    def name(self) -> str:
        """Editable name for the dataset"""
        return self._data.name

    @property
    def description(self) -> str:
        """Longer-form description of the dataset"""
        return self._data.description

    @property
    def process_id(self) -> str:
        """Unique ID of process used to create the dataset"""
        return self._data.process_id

    @property
    def process(self) -> ProcessDetail:
        """
        Object representing the process used to create the dataset
        """
        return self._client.processes.get(self.process_id)

    @property
    def project_id(self) -> str:
        """ID of the project containing the dataset"""
        return self._data.project_id

    @property
    def status(self) -> Status:
        """
        Status of the dataset
        """
        return self._data.status

    @property
    def source_dataset_ids(self) -> List[str]:
        """IDs of the datasets used as sources for this dataset (if any)"""
        return self._data.source_dataset_ids

    @property
    def source_datasets(self) -> List['DataPortalDataset']:
        """
        Objects representing the datasets used as sources for this dataset (if any)
        """
        return [
            DataPortalDataset(
                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
                client=self._client
            )
            for dataset_id in self.source_dataset_ids
        ]

    @property
    def params(self) -> dict:
        """
        Parameters used to generate the dataset
        """
        return self._get_detail().params.to_dict()

    @property
    def info(self) -> dict:
        """
        Extra information about the dataset
        """
        return self._get_detail().info.to_dict()

    @property
    def tags(self) -> List[Tag]:
        """
        Tags applied to the dataset
        """
        return self._data.tags

    @property
    def share(self) -> Optional[NamedItem]:
        """
        Share associated with the dataset, if any.
        """
        return self._get_detail().share

    @property
    def created_by(self) -> str:
        """User who created the dataset"""
        return self._data.created_by

    @property
    def created_at(self) -> datetime.datetime:
        """Timestamp of dataset creation"""
        return self._data.created_at

    def _get_detail(self) -> DatasetDetail:
        # Upgrade the summary record to a full DatasetDetail on first use,
        # caching the result so subsequent accesses avoid another API call
        if not isinstance(self._data, DatasetDetail):
            self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id)
        return self._data

    def _get_assets(self) -> DatasetAssets:
        # Fetch and cache the dataset's file/artifact listing on first use
        if not self._assets:
            self._assets = self._client.datasets.get_assets_listing(
                project_id=self.project_id,
                dataset_id=self.id
            )
        return self._assets

    def __str__(self):
        return '\n'.join([
            f"{i.title()}: {self.__getattribute__(i)}"
            for i in ['name', 'id', 'description', 'status']
        ])

    def get_file(self, relative_path: str) -> DataPortalFile:
        """
        Get a file from the dataset using its relative path.

        Args:
            relative_path (str): Relative path of file within the dataset

        Returns:
            DataPortalFile: The file at the given path

        Raises:
            DataPortalAssetNotFound: If no file matches the path
        """

        # Get the list of files in this dataset
        files = self.list_files()

        # Try getting the file using the relative path provided by the user
        try:
            return files.get_by_id(relative_path)
        except DataPortalAssetNotFound:
            # Try getting the file with the 'data/' prefix prepended
            try:
                return files.get_by_id("data/" + relative_path)
            except DataPortalAssetNotFound:
                # If not found, raise the exception using the string provided
                # by the user, not the data/ prepended version (which may be
                # confusing to the user)
                raise DataPortalAssetNotFound(
                    f"No file found with path '{relative_path}'."
                )

    def list_files(self) -> DataPortalFiles:
        """
        Return the list of files which make up the dataset.
        """
        files = self._get_assets().files
        return DataPortalFiles(
            [
                DataPortalFile(file=file, client=self._client)
                for file in files
            ]
        )

    def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
        """
        Get the artifact of a particular type from the dataset

        Raises:
            DataPortalAssetNotFound: If no artifact of the given type exists
        """
        artifacts = self._get_assets().artifacts
        artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None)
        if artifact is None:
            raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'")
        return DataPortalFile(file=artifact.file, client=self._client)

    def list_artifacts(self) -> DataPortalFiles:
        """
        Return the list of artifacts associated with the dataset

        An artifact may be something generated as part of the analysis or other process.
        See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types.

        """
        artifacts = self._get_assets().artifacts
        return DataPortalFiles(
            [
                DataPortalFile(file=artifact.file, client=self._client)
                for artifact in artifacts
            ]
        )

    def download_files(self, download_location: Optional[str] = None) -> None:
        """
        Download all the files from the dataset to a local directory.

        Args:
            download_location (str): Path to local directory
        """

        # Alias for internal method
        self.list_files().download(download_location)

    def run_analysis(
            self,
            name: Optional[str] = None,
            description: str = "",
            process: Optional[Union[DataPortalProcess, str]] = None,
            params: Optional[dict] = None,
            notifications_emails: Optional[List[str]] = None,
            compute_environment: Optional[str] = None,
            resume_dataset_id: Optional[str] = None
    ) -> str:
        """
        Runs an analysis on a dataset, returns the ID of the newly created dataset.

        The process can be provided as either a DataPortalProcess object,
        or a string which corresponds to the name or ID of the process.

        Args:
            name (str): Name of newly created dataset
            description (str): Description of newly created dataset
            process (DataPortalProcess or str): Process to run
            params (dict): Analysis parameters
            notifications_emails (List[str]): Notification email address(es)
            compute_environment (str): Name or ID of compute environment to use,
                if blank it will run in AWS
            resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
                It will attempt to re-use the previous output to minimize duplicate work

        Returns:
            dataset_id (str): ID of newly created dataset

        Raises:
            DataPortalInputError: If 'name' or 'process' is missing, or the
                named compute environment cannot be found
        """
        if name is None:
            raise DataPortalInputError("Must specify 'name' for run_analysis")
        if process is None:
            raise DataPortalInputError("Must specify 'process' for run_analysis")
        if notifications_emails is None:
            notifications_emails = []
        if params is None:
            params = {}

        # If the process is a string, try to parse it as a process name or ID
        process = parse_process_name_or_id(process, self._client)

        # Resolve the compute environment name/ID to its record, keeping the
        # user-supplied value intact so a not-found error can report it
        # (previously the variable was overwritten with None before the check,
        # so the error message always printed 'None')
        matched_environment = None
        if compute_environment:
            compute_environments = self._client.compute_environments.list_environments_for_project(
                project_id=self.project_id
            )
            matched_environment = next(
                (env for env in compute_environments
                 if env.name == compute_environment or env.id == compute_environment),
                None
            )
            if matched_environment is None:
                raise DataPortalInputError(f"Compute environment '{compute_environment}' not found")

        resp = self._client.execution.run_analysis(
            project_id=self.project_id,
            request=RunAnalysisRequest(
                name=name,
                description=description,
                process_id=process.id,
                source_dataset_ids=[self.id],
                params=RunAnalysisRequestParams.from_dict(params),
                notification_emails=notifications_emails,
                resume_dataset_id=resume_dataset_id,
                compute_environment_id=matched_environment.id if matched_environment else None
            )
        )
        return resp.id
305 306class DataPortalDatasets(DataPortalAssets[DataPortalDataset]): 307 """Collection of multiple DataPortalDataset objects.""" 308 asset_name = "dataset"
18class DataPortalDataset(DataPortalAsset): 19 """ 20 Datasets in the Data Portal are collections of files which have 21 either been uploaded directly, or which have been output by 22 an analysis pipeline or notebook. 23 """ 24 25 def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi): 26 """ 27 Instantiate a dataset object 28 29 Should be invoked from a top-level constructor, for example: 30 31 ```python 32 from cirro import DataPortal() 33 portal = DataPortal() 34 dataset = portal.get_dataset( 35 project="id-or-name-of-project", 36 dataset="id-or-name-of-dataset" 37 ) 38 ``` 39 40 """ 41 assert dataset.project_id is not None, "Must provide dataset with project_id attribute" 42 self._data = dataset 43 self._assets: Optional[DatasetAssets] = None 44 self._client = client 45 46 @property 47 def id(self) -> str: 48 """Unique identifier for the dataset""" 49 return self._data.id 50 51 @property 52 def name(self) -> str: 53 """Editable name for the dataset""" 54 return self._data.name 55 56 @property 57 def description(self) -> str: 58 """Longer name for the dataset""" 59 return self._data.description 60 61 @property 62 def process_id(self) -> str: 63 """Unique ID of process used to create the dataset""" 64 return self._data.process_id 65 66 @property 67 def process(self) -> ProcessDetail: 68 """ 69 Object representing the process used to create the dataset 70 """ 71 return self._client.processes.get(self.process_id) 72 73 @property 74 def project_id(self) -> str: 75 """ID of the project containing the dataset""" 76 return self._data.project_id 77 78 @property 79 def status(self) -> Status: 80 """ 81 Status of the dataset 82 """ 83 return self._data.status 84 85 @property 86 def source_dataset_ids(self) -> List[str]: 87 """IDs of the datasets used as sources for this dataset (if any)""" 88 return self._data.source_dataset_ids 89 90 @property 91 def source_datasets(self) -> List['DataPortalDataset']: 92 """ 93 Objects representing the datasets used 
as sources for this dataset (if any) 94 """ 95 return [ 96 DataPortalDataset( 97 dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id), 98 client=self._client 99 ) 100 for dataset_id in self.source_dataset_ids 101 ] 102 103 @property 104 def params(self) -> dict: 105 """ 106 Parameters used to generate the dataset 107 """ 108 return self._get_detail().params.to_dict() 109 110 @property 111 def info(self) -> dict: 112 """ 113 Extra information about the dataset 114 """ 115 return self._get_detail().info.to_dict() 116 117 @property 118 def tags(self) -> List[Tag]: 119 """ 120 Tags applied to the dataset 121 """ 122 return self._data.tags 123 124 @property 125 def share(self) -> Optional[NamedItem]: 126 """ 127 Share associated with the dataset, if any. 128 """ 129 return self._get_detail().share 130 131 @property 132 def created_by(self) -> str: 133 """User who created the dataset""" 134 return self._data.created_by 135 136 @property 137 def created_at(self) -> datetime.datetime: 138 """Timestamp of dataset creation""" 139 return self._data.created_at 140 141 def _get_detail(self): 142 if not isinstance(self._data, DatasetDetail): 143 self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id) 144 return self._data 145 146 def _get_assets(self): 147 if not self._assets: 148 self._assets = self._client.datasets.get_assets_listing( 149 project_id=self.project_id, 150 dataset_id=self.id 151 ) 152 return self._assets 153 154 def __str__(self): 155 return '\n'.join([ 156 f"{i.title()}: {self.__getattribute__(i)}" 157 for i in ['name', 'id', 'description', 'status'] 158 ]) 159 160 def get_file(self, relative_path: str) -> DataPortalFile: 161 """ 162 Get a file from the dataset using its relative path. 
163 164 Args: 165 relative_path (str): Relative path of file within the dataset 166 167 Returns: 168 `from cirro.sdk.file import DataPortalFile` 169 """ 170 171 # Get the list of files in this dataset 172 files = self.list_files() 173 174 # Try getting the file using the relative path provided by the user 175 try: 176 return files.get_by_id(relative_path) 177 except DataPortalAssetNotFound: 178 # Try getting the file with the 'data/' prefix prepended 179 try: 180 return files.get_by_id("data/" + relative_path) 181 except DataPortalAssetNotFound: 182 # If not found, raise the exception using the string provided 183 # by the user, not the data/ prepended version (which may be 184 # confusing to the user) 185 msg = '\n'.join([f"No file found with path '{relative_path}'."]) 186 raise DataPortalAssetNotFound(msg) 187 188 def list_files(self) -> DataPortalFiles: 189 """ 190 Return the list of files which make up the dataset. 191 """ 192 files = self._get_assets().files 193 return DataPortalFiles( 194 [ 195 DataPortalFile(file=file, client=self._client) 196 for file in files 197 ] 198 ) 199 200 def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile: 201 """ 202 Get the artifact of a particular type from the dataset 203 """ 204 artifacts = self._get_assets().artifacts 205 artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None) 206 if artifact is None: 207 raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'") 208 return DataPortalFile(file=artifact.file, client=self._client) 209 210 def list_artifacts(self) -> List[DataPortalFile]: 211 """ 212 Return the list of artifacts associated with the dataset 213 214 An artifact may be something generated as part of the analysis or other process. 215 See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types. 
216 217 """ 218 artifacts = self._get_assets().artifacts 219 return DataPortalFiles( 220 [ 221 DataPortalFile(file=artifact.file, client=self._client) 222 for artifact in artifacts 223 ] 224 ) 225 226 def download_files(self, download_location: str = None) -> None: 227 """ 228 Download all the files from the dataset to a local directory. 229 230 Args: 231 download_location (str): Path to local directory 232 """ 233 234 # Alias for internal method 235 self.list_files().download(download_location) 236 237 def run_analysis( 238 self, 239 name: str = None, 240 description: str = "", 241 process: Union[DataPortalProcess, str] = None, 242 params=None, 243 notifications_emails: List[str] = None, 244 compute_environment: str = None, 245 resume_dataset_id: str = None 246 ) -> str: 247 """ 248 Runs an analysis on a dataset, returns the ID of the newly created dataset. 249 250 The process can be provided as either a DataPortalProcess object, 251 or a string which corresponds to the name or ID of the process. 252 253 Args: 254 name (str): Name of newly created dataset 255 description (str): Description of newly created dataset 256 process (DataPortalProcess or str): Process to run 257 params (dict): Analysis parameters 258 notifications_emails (List[str]): Notification email address(es) 259 compute_environment (str): Name or ID of compute environment to use, 260 if blank it will run in AWS 261 resume_dataset_id (str): ID of dataset to resume from, used for caching task execution. 
262 It will attempt to re-use the previous output to minimize duplicate work 263 264 Returns: 265 dataset_id (str): ID of newly created dataset 266 """ 267 if name is None: 268 raise DataPortalInputError("Must specify 'name' for run_analysis") 269 if process is None: 270 raise DataPortalInputError("Must specify 'process' for run_analysis") 271 if notifications_emails is None: 272 notifications_emails = [] 273 if params is None: 274 params = {} 275 276 # If the process is a string, try to parse it as a process name or ID 277 process = parse_process_name_or_id(process, self._client) 278 279 if compute_environment: 280 compute_environments = self._client.compute_environments.list_environments_for_project( 281 project_id=self.project_id 282 ) 283 compute_environment = next( 284 (env for env in compute_environments 285 if env.name == compute_environment or env.id == compute_environment), 286 None 287 ) 288 if compute_environment is None: 289 raise DataPortalInputError(f"Compute environment '{compute_environment}' not found") 290 291 resp = self._client.execution.run_analysis( 292 project_id=self.project_id, 293 request=RunAnalysisRequest( 294 name=name, 295 description=description, 296 process_id=process.id, 297 source_dataset_ids=[self.id], 298 params=RunAnalysisRequestParams.from_dict(params), 299 notification_emails=notifications_emails, 300 resume_dataset_id=resume_dataset_id, 301 compute_environment_id=compute_environment.id if compute_environment else None 302 ) 303 ) 304 return resp.id
Datasets in the Data Portal are collections of files which have either been uploaded directly, or which have been output by an analysis pipeline or notebook.
25 def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi): 26 """ 27 Instantiate a dataset object 28 29 Should be invoked from a top-level constructor, for example: 30 31 ```python 32 from cirro import DataPortal() 33 portal = DataPortal() 34 dataset = portal.get_dataset( 35 project="id-or-name-of-project", 36 dataset="id-or-name-of-dataset" 37 ) 38 ``` 39 40 """ 41 assert dataset.project_id is not None, "Must provide dataset with project_id attribute" 42 self._data = dataset 43 self._assets: Optional[DatasetAssets] = None 44 self._client = client
Instantiate a dataset object
Should be invoked from a top-level constructor, for example:
from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
project="id-or-name-of-project",
dataset="id-or-name-of-dataset"
)
46 @property 47 def id(self) -> str: 48 """Unique identifier for the dataset""" 49 return self._data.id
Unique identifier for the dataset
51 @property 52 def name(self) -> str: 53 """Editable name for the dataset""" 54 return self._data.name
Editable name for the dataset
56 @property 57 def description(self) -> str: 58 """Longer name for the dataset""" 59 return self._data.description
Longer-form description of the dataset
61 @property 62 def process_id(self) -> str: 63 """Unique ID of process used to create the dataset""" 64 return self._data.process_id
Unique ID of process used to create the dataset
66 @property 67 def process(self) -> ProcessDetail: 68 """ 69 Object representing the process used to create the dataset 70 """ 71 return self._client.processes.get(self.process_id)
Object representing the process used to create the dataset
73 @property 74 def project_id(self) -> str: 75 """ID of the project containing the dataset""" 76 return self._data.project_id
ID of the project containing the dataset
78 @property 79 def status(self) -> Status: 80 """ 81 Status of the dataset 82 """ 83 return self._data.status
Status of the dataset
85 @property 86 def source_dataset_ids(self) -> List[str]: 87 """IDs of the datasets used as sources for this dataset (if any)""" 88 return self._data.source_dataset_ids
IDs of the datasets used as sources for this dataset (if any)
90 @property 91 def source_datasets(self) -> List['DataPortalDataset']: 92 """ 93 Objects representing the datasets used as sources for this dataset (if any) 94 """ 95 return [ 96 DataPortalDataset( 97 dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id), 98 client=self._client 99 ) 100 for dataset_id in self.source_dataset_ids 101 ]
Objects representing the datasets used as sources for this dataset (if any)
103 @property 104 def params(self) -> dict: 105 """ 106 Parameters used to generate the dataset 107 """ 108 return self._get_detail().params.to_dict()
Parameters used to generate the dataset
110 @property 111 def info(self) -> dict: 112 """ 113 Extra information about the dataset 114 """ 115 return self._get_detail().info.to_dict()
Extra information about the dataset
131 @property 132 def created_by(self) -> str: 133 """User who created the dataset""" 134 return self._data.created_by
User who created the dataset
136 @property 137 def created_at(self) -> datetime.datetime: 138 """Timestamp of dataset creation""" 139 return self._data.created_at
Timestamp of dataset creation
160 def get_file(self, relative_path: str) -> DataPortalFile: 161 """ 162 Get a file from the dataset using its relative path. 163 164 Args: 165 relative_path (str): Relative path of file within the dataset 166 167 Returns: 168 `from cirro.sdk.file import DataPortalFile` 169 """ 170 171 # Get the list of files in this dataset 172 files = self.list_files() 173 174 # Try getting the file using the relative path provided by the user 175 try: 176 return files.get_by_id(relative_path) 177 except DataPortalAssetNotFound: 178 # Try getting the file with the 'data/' prefix prepended 179 try: 180 return files.get_by_id("data/" + relative_path) 181 except DataPortalAssetNotFound: 182 # If not found, raise the exception using the string provided 183 # by the user, not the data/ prepended version (which may be 184 # confusing to the user) 185 msg = '\n'.join([f"No file found with path '{relative_path}'."]) 186 raise DataPortalAssetNotFound(msg)
Get a file from the dataset using its relative path.
Arguments:
- relative_path (str): Relative path of file within the dataset
Returns:
`DataPortalFile`: the file at the given path
188 def list_files(self) -> DataPortalFiles: 189 """ 190 Return the list of files which make up the dataset. 191 """ 192 files = self._get_assets().files 193 return DataPortalFiles( 194 [ 195 DataPortalFile(file=file, client=self._client) 196 for file in files 197 ] 198 )
Return the list of files which make up the dataset.
200 def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile: 201 """ 202 Get the artifact of a particular type from the dataset 203 """ 204 artifacts = self._get_assets().artifacts 205 artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None) 206 if artifact is None: 207 raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'") 208 return DataPortalFile(file=artifact.file, client=self._client)
Get the artifact of a particular type from the dataset
210 def list_artifacts(self) -> List[DataPortalFile]: 211 """ 212 Return the list of artifacts associated with the dataset 213 214 An artifact may be something generated as part of the analysis or other process. 215 See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types. 216 217 """ 218 artifacts = self._get_assets().artifacts 219 return DataPortalFiles( 220 [ 221 DataPortalFile(file=artifact.file, client=self._client) 222 for artifact in artifacts 223 ] 224 )
Return the list of artifacts associated with the dataset
An artifact may be something generated as part of the analysis or other process.
See cirro_api_client.v1.models.ArtifactType
for the list of possible artifact types.
226 def download_files(self, download_location: str = None) -> None: 227 """ 228 Download all the files from the dataset to a local directory. 229 230 Args: 231 download_location (str): Path to local directory 232 """ 233 234 # Alias for internal method 235 self.list_files().download(download_location)
Download all the files from the dataset to a local directory.
Arguments:
- download_location (str): Path to local directory
237 def run_analysis( 238 self, 239 name: str = None, 240 description: str = "", 241 process: Union[DataPortalProcess, str] = None, 242 params=None, 243 notifications_emails: List[str] = None, 244 compute_environment: str = None, 245 resume_dataset_id: str = None 246 ) -> str: 247 """ 248 Runs an analysis on a dataset, returns the ID of the newly created dataset. 249 250 The process can be provided as either a DataPortalProcess object, 251 or a string which corresponds to the name or ID of the process. 252 253 Args: 254 name (str): Name of newly created dataset 255 description (str): Description of newly created dataset 256 process (DataPortalProcess or str): Process to run 257 params (dict): Analysis parameters 258 notifications_emails (List[str]): Notification email address(es) 259 compute_environment (str): Name or ID of compute environment to use, 260 if blank it will run in AWS 261 resume_dataset_id (str): ID of dataset to resume from, used for caching task execution. 262 It will attempt to re-use the previous output to minimize duplicate work 263 264 Returns: 265 dataset_id (str): ID of newly created dataset 266 """ 267 if name is None: 268 raise DataPortalInputError("Must specify 'name' for run_analysis") 269 if process is None: 270 raise DataPortalInputError("Must specify 'process' for run_analysis") 271 if notifications_emails is None: 272 notifications_emails = [] 273 if params is None: 274 params = {} 275 276 # If the process is a string, try to parse it as a process name or ID 277 process = parse_process_name_or_id(process, self._client) 278 279 if compute_environment: 280 compute_environments = self._client.compute_environments.list_environments_for_project( 281 project_id=self.project_id 282 ) 283 compute_environment = next( 284 (env for env in compute_environments 285 if env.name == compute_environment or env.id == compute_environment), 286 None 287 ) 288 if compute_environment is None: 289 raise DataPortalInputError(f"Compute environment 
'{compute_environment}' not found") 290 291 resp = self._client.execution.run_analysis( 292 project_id=self.project_id, 293 request=RunAnalysisRequest( 294 name=name, 295 description=description, 296 process_id=process.id, 297 source_dataset_ids=[self.id], 298 params=RunAnalysisRequestParams.from_dict(params), 299 notification_emails=notifications_emails, 300 resume_dataset_id=resume_dataset_id, 301 compute_environment_id=compute_environment.id if compute_environment else None 302 ) 303 ) 304 return resp.id
Runs an analysis on a dataset, returns the ID of the newly created dataset.
The process can be provided as either a DataPortalProcess object, or a string which corresponds to the name or ID of the process.
Arguments:
- name (str): Name of newly created dataset
- description (str): Description of newly created dataset
- process (DataPortalProcess or str): Process to run
- params (dict): Analysis parameters
- notifications_emails (List[str]): Notification email address(es)
- compute_environment (str): Name or ID of compute environment to use, if blank it will run in AWS
- resume_dataset_id (str): ID of dataset to resume from, used for caching task execution. It will attempt to re-use the previous output to minimize duplicate work
Returns:
dataset_id (str): ID of newly created dataset
307class DataPortalDatasets(DataPortalAssets[DataPortalDataset]): 308 """Collection of multiple DataPortalDataset objects.""" 309 asset_name = "dataset"
Collection of multiple DataPortalDataset objects.