cirro.sdk.dataset

  1import datetime
  2from typing import Union, List, Optional
  3
  4from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \
  5    RunAnalysisRequestParams, Tag, ArtifactType, NamedItem
  6
  7from cirro.cirro_client import CirroApi
  8from cirro.models.assets import DatasetAssets
  9from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
 10from cirro.sdk.exceptions import DataPortalAssetNotFound
 11from cirro.sdk.exceptions import DataPortalInputError
 12from cirro.sdk.file import DataPortalFile, DataPortalFiles
 13from cirro.sdk.helpers import parse_process_name_or_id
 14from cirro.sdk.process import DataPortalProcess
 15
 16
 17class DataPortalDataset(DataPortalAsset):
 18    """
 19    Datasets in the Data Portal are collections of files which have
 20    either been uploaded directly, or which have been output by
 21    an analysis pipeline or notebook.
 22    """
 23
 24    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
 25        """
 26        Instantiate a dataset object
 27
 28        Should be invoked from a top-level constructor, for example:
 29
 30        ```python
 31        from cirro import DataPortal()
 32        portal = DataPortal()
 33        dataset = portal.get_dataset(
 34            project="id-or-name-of-project",
 35            dataset="id-or-name-of-dataset"
 36        )
 37        ```
 38
 39        """
 40        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
 41        self._data = dataset
 42        self._assets: Optional[DatasetAssets] = None
 43        self._client = client
 44
 45    @property
 46    def id(self) -> str:
 47        """Unique identifier for the dataset"""
 48        return self._data.id
 49
 50    @property
 51    def name(self) -> str:
 52        """Editable name for the dataset"""
 53        return self._data.name
 54
 55    @property
 56    def description(self) -> str:
 57        """Longer name for the dataset"""
 58        return self._data.description
 59
 60    @property
 61    def process_id(self) -> str:
 62        """Unique ID of process used to create the dataset"""
 63        return self._data.process_id
 64
 65    @property
 66    def process(self) -> ProcessDetail:
 67        """
 68        Object representing the process used to create the dataset
 69        """
 70        return self._client.processes.get(self.process_id)
 71
 72    @property
 73    def project_id(self) -> str:
 74        """ID of the project containing the dataset"""
 75        return self._data.project_id
 76
 77    @property
 78    def status(self) -> Status:
 79        """
 80        Status of the dataset
 81        """
 82        return self._data.status
 83
 84    @property
 85    def source_dataset_ids(self) -> List[str]:
 86        """IDs of the datasets used as sources for this dataset (if any)"""
 87        return self._data.source_dataset_ids
 88
 89    @property
 90    def source_datasets(self) -> List['DataPortalDataset']:
 91        """
 92        Objects representing the datasets used as sources for this dataset (if any)
 93        """
 94        return [
 95            DataPortalDataset(
 96                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
 97                client=self._client
 98            )
 99            for dataset_id in self.source_dataset_ids
100        ]
101
102    @property
103    def params(self) -> dict:
104        """
105        Parameters used to generate the dataset
106        """
107        return self._get_detail().params.to_dict()
108
109    @property
110    def info(self) -> dict:
111        """
112        Extra information about the dataset
113        """
114        return self._get_detail().info.to_dict()
115
116    @property
117    def tags(self) -> List[Tag]:
118        """
119        Tags applied to the dataset
120        """
121        return self._data.tags
122
123    @property
124    def share(self) -> Optional[NamedItem]:
125        """
126        Share associated with the dataset, if any.
127        """
128        return self._get_detail().share
129
130    @property
131    def created_by(self) -> str:
132        """User who created the dataset"""
133        return self._data.created_by
134
135    @property
136    def created_at(self) -> datetime.datetime:
137        """Timestamp of dataset creation"""
138        return self._data.created_at
139
140    def _get_detail(self):
141        if not isinstance(self._data, DatasetDetail):
142            self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id)
143        return self._data
144
145    def _get_assets(self):
146        if not self._assets:
147            self._assets = self._client.datasets.get_assets_listing(
148                project_id=self.project_id,
149                dataset_id=self.id
150            )
151        return self._assets
152
153    def __str__(self):
154        return '\n'.join([
155            f"{i.title()}: {self.__getattribute__(i)}"
156            for i in ['name', 'id', 'description', 'status']
157        ])
158
159    def get_file(self, relative_path: str) -> DataPortalFile:
160        """
161        Get a file from the dataset using its relative path.
162
163        Args:
164            relative_path (str): Relative path of file within the dataset
165
166        Returns:
167            `from cirro.sdk.file import DataPortalFile`
168        """
169
170        # Get the list of files in this dataset
171        files = self.list_files()
172
173        # Try getting the file using the relative path provided by the user
174        try:
175            return files.get_by_id(relative_path)
176        except DataPortalAssetNotFound:
177            # Try getting the file with the 'data/' prefix prepended
178            try:
179                return files.get_by_id("data/" + relative_path)
180            except DataPortalAssetNotFound:
181                # If not found, raise the exception using the string provided
182                # by the user, not the data/ prepended version (which may be
183                # confusing to the user)
184                msg = '\n'.join([f"No file found with path '{relative_path}'."])
185                raise DataPortalAssetNotFound(msg)
186
187    def list_files(self) -> DataPortalFiles:
188        """
189        Return the list of files which make up the dataset.
190        """
191        files = self._get_assets().files
192        return DataPortalFiles(
193            [
194                DataPortalFile(file=file, client=self._client)
195                for file in files
196            ]
197        )
198
199    def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
200        """
201        Get the artifact of a particular type from the dataset
202        """
203        artifacts = self._get_assets().artifacts
204        artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None)
205        if artifact is None:
206            raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'")
207        return DataPortalFile(file=artifact.file, client=self._client)
208
209    def list_artifacts(self) -> List[DataPortalFile]:
210        """
211        Return the list of artifacts associated with the dataset
212
213        An artifact may be something generated as part of the analysis or other process.
214        See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types.
215
216        """
217        artifacts = self._get_assets().artifacts
218        return DataPortalFiles(
219            [
220                DataPortalFile(file=artifact.file, client=self._client)
221                for artifact in artifacts
222            ]
223        )
224
225    def download_files(self, download_location: str = None) -> None:
226        """
227        Download all the files from the dataset to a local directory.
228
229        Args:
230            download_location (str): Path to local directory
231        """
232
233        # Alias for internal method
234        self.list_files().download(download_location)
235
236    def run_analysis(
237            self,
238            name: str = None,
239            description: str = "",
240            process: Union[DataPortalProcess, str] = None,
241            params=None,
242            notifications_emails: List[str] = None,
243            compute_environment: str = None,
244            resume_dataset_id: str = None
245    ) -> str:
246        """
247        Runs an analysis on a dataset, returns the ID of the newly created dataset.
248
249        The process can be provided as either a DataPortalProcess object,
250        or a string which corresponds to the name or ID of the process.
251
252        Args:
253            name (str): Name of newly created dataset
254            description (str): Description of newly created dataset
255            process (DataPortalProcess or str): Process to run
256            params (dict): Analysis parameters
257            notifications_emails (List[str]): Notification email address(es)
258            compute_environment (str): Name or ID of compute environment to use,
259             if blank it will run in AWS
260            resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
261             It will attempt to re-use the previous output to minimize duplicate work
262
263        Returns:
264            dataset_id (str): ID of newly created dataset
265        """
266        if name is None:
267            raise DataPortalInputError("Must specify 'name' for run_analysis")
268        if process is None:
269            raise DataPortalInputError("Must specify 'process' for run_analysis")
270        if notifications_emails is None:
271            notifications_emails = []
272        if params is None:
273            params = {}
274
275        # If the process is a string, try to parse it as a process name or ID
276        process = parse_process_name_or_id(process, self._client)
277
278        if compute_environment:
279            compute_environments = self._client.compute_environments.list_environments_for_project(
280                project_id=self.project_id
281            )
282            compute_environment = next(
283                (env for env in compute_environments
284                 if env.name == compute_environment or env.id == compute_environment),
285                None
286            )
287            if compute_environment is None:
288                raise DataPortalInputError(f"Compute environment '{compute_environment}' not found")
289
290        resp = self._client.execution.run_analysis(
291            project_id=self.project_id,
292            request=RunAnalysisRequest(
293                name=name,
294                description=description,
295                process_id=process.id,
296                source_dataset_ids=[self.id],
297                params=RunAnalysisRequestParams.from_dict(params),
298                notification_emails=notifications_emails,
299                resume_dataset_id=resume_dataset_id,
300                compute_environment_id=compute_environment.id if compute_environment else None
301            )
302        )
303        return resp.id
304
305
class DataPortalDatasets(DataPortalAssets[DataPortalDataset]):
    """A collection holding multiple DataPortalDataset objects."""

    # Singular label used by the base collection for lookup error messages
    asset_name = "dataset"
class DataPortalDataset(cirro.sdk.asset.DataPortalAsset):
 18class DataPortalDataset(DataPortalAsset):
 19    """
 20    Datasets in the Data Portal are collections of files which have
 21    either been uploaded directly, or which have been output by
 22    an analysis pipeline or notebook.
 23    """
 24
 25    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
 26        """
 27        Instantiate a dataset object
 28
 29        Should be invoked from a top-level constructor, for example:
 30
 31        ```python
 32        from cirro import DataPortal()
 33        portal = DataPortal()
 34        dataset = portal.get_dataset(
 35            project="id-or-name-of-project",
 36            dataset="id-or-name-of-dataset"
 37        )
 38        ```
 39
 40        """
 41        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
 42        self._data = dataset
 43        self._assets: Optional[DatasetAssets] = None
 44        self._client = client
 45
 46    @property
 47    def id(self) -> str:
 48        """Unique identifier for the dataset"""
 49        return self._data.id
 50
 51    @property
 52    def name(self) -> str:
 53        """Editable name for the dataset"""
 54        return self._data.name
 55
 56    @property
 57    def description(self) -> str:
 58        """Longer name for the dataset"""
 59        return self._data.description
 60
 61    @property
 62    def process_id(self) -> str:
 63        """Unique ID of process used to create the dataset"""
 64        return self._data.process_id
 65
 66    @property
 67    def process(self) -> ProcessDetail:
 68        """
 69        Object representing the process used to create the dataset
 70        """
 71        return self._client.processes.get(self.process_id)
 72
 73    @property
 74    def project_id(self) -> str:
 75        """ID of the project containing the dataset"""
 76        return self._data.project_id
 77
 78    @property
 79    def status(self) -> Status:
 80        """
 81        Status of the dataset
 82        """
 83        return self._data.status
 84
 85    @property
 86    def source_dataset_ids(self) -> List[str]:
 87        """IDs of the datasets used as sources for this dataset (if any)"""
 88        return self._data.source_dataset_ids
 89
 90    @property
 91    def source_datasets(self) -> List['DataPortalDataset']:
 92        """
 93        Objects representing the datasets used as sources for this dataset (if any)
 94        """
 95        return [
 96            DataPortalDataset(
 97                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
 98                client=self._client
 99            )
100            for dataset_id in self.source_dataset_ids
101        ]
102
103    @property
104    def params(self) -> dict:
105        """
106        Parameters used to generate the dataset
107        """
108        return self._get_detail().params.to_dict()
109
110    @property
111    def info(self) -> dict:
112        """
113        Extra information about the dataset
114        """
115        return self._get_detail().info.to_dict()
116
117    @property
118    def tags(self) -> List[Tag]:
119        """
120        Tags applied to the dataset
121        """
122        return self._data.tags
123
124    @property
125    def share(self) -> Optional[NamedItem]:
126        """
127        Share associated with the dataset, if any.
128        """
129        return self._get_detail().share
130
131    @property
132    def created_by(self) -> str:
133        """User who created the dataset"""
134        return self._data.created_by
135
136    @property
137    def created_at(self) -> datetime.datetime:
138        """Timestamp of dataset creation"""
139        return self._data.created_at
140
141    def _get_detail(self):
142        if not isinstance(self._data, DatasetDetail):
143            self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id)
144        return self._data
145
146    def _get_assets(self):
147        if not self._assets:
148            self._assets = self._client.datasets.get_assets_listing(
149                project_id=self.project_id,
150                dataset_id=self.id
151            )
152        return self._assets
153
154    def __str__(self):
155        return '\n'.join([
156            f"{i.title()}: {self.__getattribute__(i)}"
157            for i in ['name', 'id', 'description', 'status']
158        ])
159
160    def get_file(self, relative_path: str) -> DataPortalFile:
161        """
162        Get a file from the dataset using its relative path.
163
164        Args:
165            relative_path (str): Relative path of file within the dataset
166
167        Returns:
168            `from cirro.sdk.file import DataPortalFile`
169        """
170
171        # Get the list of files in this dataset
172        files = self.list_files()
173
174        # Try getting the file using the relative path provided by the user
175        try:
176            return files.get_by_id(relative_path)
177        except DataPortalAssetNotFound:
178            # Try getting the file with the 'data/' prefix prepended
179            try:
180                return files.get_by_id("data/" + relative_path)
181            except DataPortalAssetNotFound:
182                # If not found, raise the exception using the string provided
183                # by the user, not the data/ prepended version (which may be
184                # confusing to the user)
185                msg = '\n'.join([f"No file found with path '{relative_path}'."])
186                raise DataPortalAssetNotFound(msg)
187
188    def list_files(self) -> DataPortalFiles:
189        """
190        Return the list of files which make up the dataset.
191        """
192        files = self._get_assets().files
193        return DataPortalFiles(
194            [
195                DataPortalFile(file=file, client=self._client)
196                for file in files
197            ]
198        )
199
200    def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
201        """
202        Get the artifact of a particular type from the dataset
203        """
204        artifacts = self._get_assets().artifacts
205        artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None)
206        if artifact is None:
207            raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'")
208        return DataPortalFile(file=artifact.file, client=self._client)
209
210    def list_artifacts(self) -> List[DataPortalFile]:
211        """
212        Return the list of artifacts associated with the dataset
213
214        An artifact may be something generated as part of the analysis or other process.
215        See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types.
216
217        """
218        artifacts = self._get_assets().artifacts
219        return DataPortalFiles(
220            [
221                DataPortalFile(file=artifact.file, client=self._client)
222                for artifact in artifacts
223            ]
224        )
225
226    def download_files(self, download_location: str = None) -> None:
227        """
228        Download all the files from the dataset to a local directory.
229
230        Args:
231            download_location (str): Path to local directory
232        """
233
234        # Alias for internal method
235        self.list_files().download(download_location)
236
237    def run_analysis(
238            self,
239            name: str = None,
240            description: str = "",
241            process: Union[DataPortalProcess, str] = None,
242            params=None,
243            notifications_emails: List[str] = None,
244            compute_environment: str = None,
245            resume_dataset_id: str = None
246    ) -> str:
247        """
248        Runs an analysis on a dataset, returns the ID of the newly created dataset.
249
250        The process can be provided as either a DataPortalProcess object,
251        or a string which corresponds to the name or ID of the process.
252
253        Args:
254            name (str): Name of newly created dataset
255            description (str): Description of newly created dataset
256            process (DataPortalProcess or str): Process to run
257            params (dict): Analysis parameters
258            notifications_emails (List[str]): Notification email address(es)
259            compute_environment (str): Name or ID of compute environment to use,
260             if blank it will run in AWS
261            resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
262             It will attempt to re-use the previous output to minimize duplicate work
263
264        Returns:
265            dataset_id (str): ID of newly created dataset
266        """
267        if name is None:
268            raise DataPortalInputError("Must specify 'name' for run_analysis")
269        if process is None:
270            raise DataPortalInputError("Must specify 'process' for run_analysis")
271        if notifications_emails is None:
272            notifications_emails = []
273        if params is None:
274            params = {}
275
276        # If the process is a string, try to parse it as a process name or ID
277        process = parse_process_name_or_id(process, self._client)
278
279        if compute_environment:
280            compute_environments = self._client.compute_environments.list_environments_for_project(
281                project_id=self.project_id
282            )
283            compute_environment = next(
284                (env for env in compute_environments
285                 if env.name == compute_environment or env.id == compute_environment),
286                None
287            )
288            if compute_environment is None:
289                raise DataPortalInputError(f"Compute environment '{compute_environment}' not found")
290
291        resp = self._client.execution.run_analysis(
292            project_id=self.project_id,
293            request=RunAnalysisRequest(
294                name=name,
295                description=description,
296                process_id=process.id,
297                source_dataset_ids=[self.id],
298                params=RunAnalysisRequestParams.from_dict(params),
299                notification_emails=notifications_emails,
300                resume_dataset_id=resume_dataset_id,
301                compute_environment_id=compute_environment.id if compute_environment else None
302            )
303        )
304        return resp.id

Datasets in the Data Portal are collections of files which have either been uploaded directly, or which have been output by an analysis pipeline or notebook.

DataPortalDataset( dataset: Union[cirro_api_client.v1.models.Dataset, cirro_api_client.v1.models.DatasetDetail], client: cirro.CirroApi)
25    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
26        """
27        Instantiate a dataset object
28
29        Should be invoked from a top-level constructor, for example:
30
31        ```python
32        from cirro import DataPortal
33        portal = DataPortal()
34        dataset = portal.get_dataset(
35            project="id-or-name-of-project",
36            dataset="id-or-name-of-dataset"
37        )
38        ```
39
40        """
41        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
42        self._data = dataset
43        self._assets: Optional[DatasetAssets] = None
44        self._client = client

Instantiate a dataset object

Should be invoked from a top-level constructor, for example:

from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
    project="id-or-name-of-project",
    dataset="id-or-name-of-dataset"
)
id: str
46    @property
47    def id(self) -> str:
48        """Unique identifier for the dataset"""
49        return self._data.id

Unique identifier for the dataset

name: str
51    @property
52    def name(self) -> str:
53        """Editable name for the dataset"""
54        return self._data.name

Editable name for the dataset

description: str
56    @property
57    def description(self) -> str:
58        """Longer name for the dataset"""
59        return self._data.description

Longer description of the dataset

process_id: str
61    @property
62    def process_id(self) -> str:
63        """Unique ID of process used to create the dataset"""
64        return self._data.process_id

Unique ID of process used to create the dataset

66    @property
67    def process(self) -> ProcessDetail:
68        """
69        Object representing the process used to create the dataset
70        """
71        return self._client.processes.get(self.process_id)

Object representing the process used to create the dataset

project_id: str
73    @property
74    def project_id(self) -> str:
75        """ID of the project containing the dataset"""
76        return self._data.project_id

ID of the project containing the dataset

status: cirro_api_client.v1.models.Status
78    @property
79    def status(self) -> Status:
80        """
81        Status of the dataset
82        """
83        return self._data.status

Status of the dataset

source_dataset_ids: List[str]
85    @property
86    def source_dataset_ids(self) -> List[str]:
87        """IDs of the datasets used as sources for this dataset (if any)"""
88        return self._data.source_dataset_ids

IDs of the datasets used as sources for this dataset (if any)

source_datasets: List[DataPortalDataset]
 90    @property
 91    def source_datasets(self) -> List['DataPortalDataset']:
 92        """
 93        Objects representing the datasets used as sources for this dataset (if any)
 94        """
 95        return [
 96            DataPortalDataset(
 97                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
 98                client=self._client
 99            )
100            for dataset_id in self.source_dataset_ids
101        ]

Objects representing the datasets used as sources for this dataset (if any)

params: dict
103    @property
104    def params(self) -> dict:
105        """
106        Parameters used to generate the dataset
107        """
108        return self._get_detail().params.to_dict()

Parameters used to generate the dataset

info: dict
110    @property
111    def info(self) -> dict:
112        """
113        Extra information about the dataset
114        """
115        return self._get_detail().info.to_dict()

Extra information about the dataset

tags: List[cirro_api_client.v1.models.Tag]
117    @property
118    def tags(self) -> List[Tag]:
119        """
120        Tags applied to the dataset
121        """
122        return self._data.tags

Tags applied to the dataset

share: Optional[cirro_api_client.v1.models.NamedItem]
124    @property
125    def share(self) -> Optional[NamedItem]:
126        """
127        Share associated with the dataset, if any.
128        """
129        return self._get_detail().share

Share associated with the dataset, if any.

created_by: str
131    @property
132    def created_by(self) -> str:
133        """User who created the dataset"""
134        return self._data.created_by

User who created the dataset

created_at: datetime.datetime
136    @property
137    def created_at(self) -> datetime.datetime:
138        """Timestamp of dataset creation"""
139        return self._data.created_at

Timestamp of dataset creation

def get_file(self, relative_path: str) -> cirro.sdk.file.DataPortalFile:
160    def get_file(self, relative_path: str) -> DataPortalFile:
161        """
162        Get a file from the dataset using its relative path.
163
164        Args:
165            relative_path (str): Relative path of file within the dataset
166
167        Returns:
168            `from cirro.sdk.file import DataPortalFile`
169        """
170
171        # Get the list of files in this dataset
172        files = self.list_files()
173
174        # Try getting the file using the relative path provided by the user
175        try:
176            return files.get_by_id(relative_path)
177        except DataPortalAssetNotFound:
178            # Try getting the file with the 'data/' prefix prepended
179            try:
180                return files.get_by_id("data/" + relative_path)
181            except DataPortalAssetNotFound:
182                # If not found, raise the exception using the string provided
183                # by the user, not the data/ prepended version (which may be
184                # confusing to the user)
185                msg = '\n'.join([f"No file found with path '{relative_path}'."])
186                raise DataPortalAssetNotFound(msg)

Get a file from the dataset using its relative path.

Arguments:
  • relative_path (str): Relative path of file within the dataset
Returns:

DataPortalFile: The file within the dataset located at the given relative path

def list_files(self) -> cirro.sdk.file.DataPortalFiles:
188    def list_files(self) -> DataPortalFiles:
189        """
190        Return the list of files which make up the dataset.
191        """
192        files = self._get_assets().files
193        return DataPortalFiles(
194            [
195                DataPortalFile(file=file, client=self._client)
196                for file in files
197            ]
198        )

Return the list of files which make up the dataset.

def get_artifact( self, artifact_type: cirro_api_client.v1.models.ArtifactType) -> cirro.sdk.file.DataPortalFile:
200    def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
201        """
202        Get the artifact of a particular type from the dataset
203        """
204        artifacts = self._get_assets().artifacts
205        artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None)
206        if artifact is None:
207            raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'")
208        return DataPortalFile(file=artifact.file, client=self._client)

Get the artifact of a particular type from the dataset

def list_artifacts(self) -> List[cirro.sdk.file.DataPortalFile]:
210    def list_artifacts(self) -> List[DataPortalFile]:
211        """
212        Return the list of artifacts associated with the dataset
213
214        An artifact may be something generated as part of the analysis or other process.
215        See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types.
216
217        """
218        artifacts = self._get_assets().artifacts
219        return DataPortalFiles(
220            [
221                DataPortalFile(file=artifact.file, client=self._client)
222                for artifact in artifacts
223            ]
224        )

Return the list of artifacts associated with the dataset

An artifact may be something generated as part of the analysis or other process. See cirro_api_client.v1.models.ArtifactType for the list of possible artifact types.

def download_files(self, download_location: str = None) -> None:
226    def download_files(self, download_location: str = None) -> None:
227        """
228        Download all the files from the dataset to a local directory.
229
230        Args:
231            download_location (str): Path to local directory
232        """
233
234        # Alias for internal method
235        self.list_files().download(download_location)

Download all the files from the dataset to a local directory.

Arguments:
  • download_location (str): Path to local directory
def run_analysis( self, name: str = None, description: str = '', process: Union[cirro.DataPortalProcess, str] = None, params=None, notifications_emails: List[str] = None, compute_environment: str = None, resume_dataset_id: str = None) -> str:
237    def run_analysis(
238            self,
239            name: str = None,
240            description: str = "",
241            process: Union[DataPortalProcess, str] = None,
242            params=None,
243            notifications_emails: List[str] = None,
244            compute_environment: str = None,
245            resume_dataset_id: str = None
246    ) -> str:
247        """
248        Runs an analysis on a dataset, returns the ID of the newly created dataset.
249
250        The process can be provided as either a DataPortalProcess object,
251        or a string which corresponds to the name or ID of the process.
252
253        Args:
254            name (str): Name of newly created dataset
255            description (str): Description of newly created dataset
256            process (DataPortalProcess or str): Process to run
257            params (dict): Analysis parameters
258            notifications_emails (List[str]): Notification email address(es)
259            compute_environment (str): Name or ID of compute environment to use,
260             if blank it will run in AWS
261            resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
262             It will attempt to re-use the previous output to minimize duplicate work
263
264        Returns:
265            dataset_id (str): ID of newly created dataset
266        """
267        if name is None:
268            raise DataPortalInputError("Must specify 'name' for run_analysis")
269        if process is None:
270            raise DataPortalInputError("Must specify 'process' for run_analysis")
271        if notifications_emails is None:
272            notifications_emails = []
273        if params is None:
274            params = {}
275
276        # If the process is a string, try to parse it as a process name or ID
277        process = parse_process_name_or_id(process, self._client)
278
279        if compute_environment:
280            compute_environments = self._client.compute_environments.list_environments_for_project(
281                project_id=self.project_id
282            )
283            compute_environment = next(
284                (env for env in compute_environments
285                 if env.name == compute_environment or env.id == compute_environment),
286                None
287            )
288            if compute_environment is None:
289                raise DataPortalInputError(f"Compute environment '{compute_environment}' not found")
290
291        resp = self._client.execution.run_analysis(
292            project_id=self.project_id,
293            request=RunAnalysisRequest(
294                name=name,
295                description=description,
296                process_id=process.id,
297                source_dataset_ids=[self.id],
298                params=RunAnalysisRequestParams.from_dict(params),
299                notification_emails=notifications_emails,
300                resume_dataset_id=resume_dataset_id,
301                compute_environment_id=compute_environment.id if compute_environment else None
302            )
303        )
304        return resp.id

Runs an analysis on a dataset, returns the ID of the newly created dataset.

The process can be provided as either a DataPortalProcess object, or a string which corresponds to the name or ID of the process.

Arguments:
  • name (str): Name of newly created dataset
  • description (str): Description of newly created dataset
  • process (DataPortalProcess or str): Process to run
  • params (dict): Analysis parameters
  • notifications_emails (List[str]): Notification email address(es)
  • compute_environment (str): Name or ID of compute environment to use, if blank it will run in AWS
  • resume_dataset_id (str): ID of dataset to resume from, used for caching task execution. It will attempt to re-use the previous output to minimize duplicate work
Returns:

dataset_id (str): ID of newly created dataset

307class DataPortalDatasets(DataPortalAssets[DataPortalDataset]):
308    """Collection of multiple DataPortalDataset objects."""
309    asset_name = "dataset"

Collection of multiple DataPortalDataset objects.

asset_name = 'dataset'