cirro.sdk.dataset

View Source

  1import datetime
  2from typing import Union, List, Optional
  3
  4from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, FileEntry, \
  5    ProcessDetail, Status, DatasetDetailParams, RunAnalysisRequestParams, DatasetDetailInfo, \
  6    Tag
  7
  8from cirro.cirro_client import CirroApi
  9from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
 10from cirro.sdk.exceptions import DataPortalInputError
 11from cirro.sdk.file import DataPortalFile, DataPortalFiles
 12from cirro.sdk.helpers import parse_process_name_or_id
 13from cirro.sdk.process import DataPortalProcess
 14
 15
 16class DataPortalDataset(DataPortalAsset):
 17    """
 18    Datasets in the Data Portal are collections of files which have
 19    either been uploaded directly, or which have been output by
 20    an analysis pipeline or notebook.
 21    """
 22
 23    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
 24        """
 25        Instantiate a dataset object
 26
 27        Should be invoked from a top-level constructor, for example:
 28
 29        ```python
 30        from cirro import DataPortal()
 31        portal = DataPortal()
 32        dataset = portal.get_dataset(
 33            project="id-or-name-of-project",
 34            dataset="id-or-name-of-dataset"
 35        )
 36        ```
 37
 38        """
 39        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
 40        self._data = dataset
 41        self._files: Optional[List[FileEntry]] = None
 42        self._client = client
 43
 44    @property
 45    def id(self) -> str:
 46        """Unique identifier for the dataset"""
 47        return self._data.id
 48
 49    @property
 50    def name(self) -> str:
 51        """Editible name for the dataset"""
 52        return self._data.name
 53
 54    @property
 55    def description(self) -> str:
 56        """Longer name for the dataset"""
 57        return self._data.description
 58
 59    @property
 60    def process_id(self) -> str:
 61        """Unique ID of process used to create the dataset"""
 62        return self._data.process_id
 63
 64    @property
 65    def process(self) -> ProcessDetail:
 66        """
 67        Object representing the process used to create the dataset
 68        """
 69        return self._client.processes.get(self.process_id)
 70
 71    @property
 72    def project_id(self) -> str:
 73        """ID of the project containing the dataset"""
 74        return self._data.project_id
 75
 76    @property
 77    def status(self) -> Status:
 78        """
 79        Status of the dataset
 80        """
 81        return self._data.status
 82
 83    @property
 84    def source_dataset_ids(self) -> List[str]:
 85        """IDs of the datasets used as sources for this dataset (if any)"""
 86        return self._data.source_dataset_ids
 87
 88    @property
 89    def source_datasets(self) -> List['DataPortalDataset']:
 90        """
 91        Objects representing the datasets used as sources for this dataset (if any)
 92        """
 93        return [
 94            DataPortalDataset(
 95                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
 96                client=self._client
 97            )
 98            for dataset_id in self.source_dataset_ids
 99        ]
100
101    @property
102    def params(self) -> DatasetDetailParams:
103        """
104        Parameters used to generate the dataset
105        """
106        return self._get_detail().params
107
108    @property
109    def info(self) -> DatasetDetailInfo:
110        """
111        Detailed information about the dataset
112        """
113        return self._get_detail().info
114
115    @property
116    def tags(self) -> List[Tag]:
117        """
118        Tags applied to the dataset
119        """
120        return self._data.tags
121
122    @property
123    def created_by(self) -> str:
124        """User who created the dataset"""
125        return self._data.created_by
126
127    @property
128    def created_at(self) -> datetime.datetime:
129        """Timestamp of dataset creation"""
130        return self._data.created_at
131
132    def _get_detail(self):
133        if not isinstance(self._data, DatasetDetail):
134            self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id)
135        return self._data
136
137    def __str__(self):
138        return '\n'.join([
139            f"{i.title()}: {self.__getattribute__(i)}"
140            for i in ['name', 'id', 'description', 'status']
141        ])
142
143    def list_files(self) -> DataPortalFiles:
144        """
145        Return the list of files which make up the dataset.
146        """
147        if not self._files:
148            self._files = DataPortalFiles(
149                [
150                    DataPortalFile(file=file, client=self._client)
151                    for file in self._client.datasets.get_file_listing(
152                        project_id=self.project_id,
153                        dataset_id=self.id
154                    )
155                ]
156            )
157        return self._files
158
159    def download_files(self, download_location: str = None) -> None:
160        """
161        Download all the files from the dataset to a local directory.
162
163        Args:
164            download_location (str): Path to local directory
165        """
166
167        # Alias for internal method
168        self.list_files().download(download_location)
169
170    def run_analysis(
171            self,
172            name: str = None,
173            description: str = "",
174            process: Union[DataPortalProcess, str] = None,
175            params=None,
176            notifications_emails=None
177    ) -> str:
178        """
179        Runs an analysis on a dataset, returns the ID of the newly created dataset.
180
181        The process can be provided as either a DataPortalProcess object,
182        or a string which corresponds to the name or ID of the process.
183
184        Args:
185            name (str): Name of newly created dataset
186            description (str): Description of newly created dataset
187            process (DataPortalProcess or str): Process to run
188            params (dict): Analysis parameters
189            notifications_emails (List[str]): Notification email address(es)
190
191        Returns:
192            dataset_id (str): ID of newly created dataset
193        """
194        if name is None:
195            raise DataPortalInputError("Must specify 'name' for run_analysis")
196        if process is None:
197            raise DataPortalInputError("Must specify 'process' for run_analysis")
198        if notifications_emails is None:
199            notifications_emails = []
200        if params is None:
201            params = {}
202
203        # If the process is a string, try to parse it as a process name or ID
204        process = parse_process_name_or_id(process, self._client)
205
206        resp = self._client.execution.run_analysis(
207            project_id=self.project_id,
208            request=RunAnalysisRequest(
209                name=name,
210                description=description,
211                process_id=process.id,
212                source_dataset_ids=[self.id],
213                params=RunAnalysisRequestParams.from_dict(params),
214                notification_emails=notifications_emails
215            )
216        )
217        return resp.id
218
219
class DataPortalDatasets(DataPortalAssets[DataPortalDataset]):
    """Collection of multiple DataPortalDataset objects."""
    # Label for the kind of asset held in this collection.
    # NOTE(review): presumably consumed by DataPortalAssets (e.g. in messages) — confirm
    asset_name = "dataset"

class DataPortalDataset(cirro.sdk.asset.DataPortalAsset): View Source

 17class DataPortalDataset(DataPortalAsset):
 18    """
 19    Datasets in the Data Portal are collections of files which have
 20    either been uploaded directly, or which have been output by
 21    an analysis pipeline or notebook.
 22    """
 23
 24    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
 25        """
 26        Instantiate a dataset object
 27
 28        Should be invoked from a top-level constructor, for example:
 29
 30        ```python
 31        from cirro import DataPortal()
 32        portal = DataPortal()
 33        dataset = portal.get_dataset(
 34            project="id-or-name-of-project",
 35            dataset="id-or-name-of-dataset"
 36        )
 37        ```
 38
 39        """
 40        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
 41        self._data = dataset
 42        self._files: Optional[List[FileEntry]] = None
 43        self._client = client
 44
 45    @property
 46    def id(self) -> str:
 47        """Unique identifier for the dataset"""
 48        return self._data.id
 49
 50    @property
 51    def name(self) -> str:
 52        """Editible name for the dataset"""
 53        return self._data.name
 54
 55    @property
 56    def description(self) -> str:
 57        """Longer name for the dataset"""
 58        return self._data.description
 59
 60    @property
 61    def process_id(self) -> str:
 62        """Unique ID of process used to create the dataset"""
 63        return self._data.process_id
 64
 65    @property
 66    def process(self) -> ProcessDetail:
 67        """
 68        Object representing the process used to create the dataset
 69        """
 70        return self._client.processes.get(self.process_id)
 71
 72    @property
 73    def project_id(self) -> str:
 74        """ID of the project containing the dataset"""
 75        return self._data.project_id
 76
 77    @property
 78    def status(self) -> Status:
 79        """
 80        Status of the dataset
 81        """
 82        return self._data.status
 83
 84    @property
 85    def source_dataset_ids(self) -> List[str]:
 86        """IDs of the datasets used as sources for this dataset (if any)"""
 87        return self._data.source_dataset_ids
 88
 89    @property
 90    def source_datasets(self) -> List['DataPortalDataset']:
 91        """
 92        Objects representing the datasets used as sources for this dataset (if any)
 93        """
 94        return [
 95            DataPortalDataset(
 96                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
 97                client=self._client
 98            )
 99            for dataset_id in self.source_dataset_ids
100        ]
101
102    @property
103    def params(self) -> DatasetDetailParams:
104        """
105        Parameters used to generate the dataset
106        """
107        return self._get_detail().params
108
109    @property
110    def info(self) -> DatasetDetailInfo:
111        """
112        Detailed information about the dataset
113        """
114        return self._get_detail().info
115
116    @property
117    def tags(self) -> List[Tag]:
118        """
119        Tags applied to the dataset
120        """
121        return self._data.tags
122
123    @property
124    def created_by(self) -> str:
125        """User who created the dataset"""
126        return self._data.created_by
127
128    @property
129    def created_at(self) -> datetime.datetime:
130        """Timestamp of dataset creation"""
131        return self._data.created_at
132
133    def _get_detail(self):
134        if not isinstance(self._data, DatasetDetail):
135            self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id)
136        return self._data
137
138    def __str__(self):
139        return '\n'.join([
140            f"{i.title()}: {self.__getattribute__(i)}"
141            for i in ['name', 'id', 'description', 'status']
142        ])
143
144    def list_files(self) -> DataPortalFiles:
145        """
146        Return the list of files which make up the dataset.
147        """
148        if not self._files:
149            self._files = DataPortalFiles(
150                [
151                    DataPortalFile(file=file, client=self._client)
152                    for file in self._client.datasets.get_file_listing(
153                        project_id=self.project_id,
154                        dataset_id=self.id
155                    )
156                ]
157            )
158        return self._files
159
160    def download_files(self, download_location: str = None) -> None:
161        """
162        Download all the files from the dataset to a local directory.
163
164        Args:
165            download_location (str): Path to local directory
166        """
167
168        # Alias for internal method
169        self.list_files().download(download_location)
170
171    def run_analysis(
172            self,
173            name: str = None,
174            description: str = "",
175            process: Union[DataPortalProcess, str] = None,
176            params=None,
177            notifications_emails=None
178    ) -> str:
179        """
180        Runs an analysis on a dataset, returns the ID of the newly created dataset.
181
182        The process can be provided as either a DataPortalProcess object,
183        or a string which corresponds to the name or ID of the process.
184
185        Args:
186            name (str): Name of newly created dataset
187            description (str): Description of newly created dataset
188            process (DataPortalProcess or str): Process to run
189            params (dict): Analysis parameters
190            notifications_emails (List[str]): Notification email address(es)
191
192        Returns:
193            dataset_id (str): ID of newly created dataset
194        """
195        if name is None:
196            raise DataPortalInputError("Must specify 'name' for run_analysis")
197        if process is None:
198            raise DataPortalInputError("Must specify 'process' for run_analysis")
199        if notifications_emails is None:
200            notifications_emails = []
201        if params is None:
202            params = {}
203
204        # If the process is a string, try to parse it as a process name or ID
205        process = parse_process_name_or_id(process, self._client)
206
207        resp = self._client.execution.run_analysis(
208            project_id=self.project_id,
209            request=RunAnalysisRequest(
210                name=name,
211                description=description,
212                process_id=process.id,
213                source_dataset_ids=[self.id],
214                params=RunAnalysisRequestParams.from_dict(params),
215                notification_emails=notifications_emails
216            )
217        )
218        return resp.id

Datasets in the Data Portal are collections of files which have either been uploaded directly, or which have been output by an analysis pipeline or notebook.

DataPortalDataset( dataset: Union[cirro_api_client.v1.models.Dataset, cirro_api_client.v1.models.DatasetDetail], client: cirro.CirroApi) View Source

    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
        """
        Instantiate a dataset object

        Should be invoked from a top-level constructor, for example:

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        ```

        Args:
            dataset (Dataset or DatasetDetail): API record describing the dataset;
                its `project_id` must not be None
            client (CirroApi): Client used for every API call made by this object

        Raises:
            AssertionError: If `dataset.project_id` is None
        """
        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
        self._data = dataset
        # Cache for list_files(); starts out unset (None)
        self._files: Optional[DataPortalFiles] = None
        self._client = client

Instantiate a dataset object

Should be invoked from a top-level constructor, for example:

from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
    project="id-or-name-of-project",
    dataset="id-or-name-of-dataset"
)

id: str View Source

    @property
    def id(self) -> str:
        """Unique identifier for the dataset (the `id` field of the underlying API record)"""
        return self._data.id

Unique identifier for the dataset

name: str View Source

    @property
    def name(self) -> str:
        """Editable name for the dataset (the `name` field of the underlying API record)"""
        return self._data.name

Editable name for the dataset

description: str View Source

    @property
    def description(self) -> str:
        """Longer, free-text description of the dataset (the record's `description` field)"""
        return self._data.description

Longer name for the dataset

process_id: str View Source

    @property
    def process_id(self) -> str:
        """Unique ID of the process used to create the dataset (the record's `process_id` field)"""
        return self._data.process_id

Unique ID of process used to create the dataset

process: cirro_api_client.v1.models.ProcessDetail View Source

    @property
    def process(self) -> ProcessDetail:
        """
        Object representing the process used to create the dataset

        Looked up through the client by `process_id` each time the property is read.
        """
        return self._client.processes.get(self.process_id)

Object representing the process used to create the dataset

project_id: str View Source

    @property
    def project_id(self) -> str:
        """ID of the project containing the dataset (the record's `project_id` field)"""
        return self._data.project_id

ID of the project containing the dataset

status: cirro_api_client.v1.models.Status View Source

    @property
    def status(self) -> Status:
        """
        Status of the dataset

        Read from the record currently held by this object; no API call is made here.
        """
        return self._data.status

Status of the dataset

source_dataset_ids: List[str] View Source

    @property
    def source_dataset_ids(self) -> List[str]:
        """IDs of the datasets used as sources for this dataset (if any), as stored on the record"""
        return self._data.source_dataset_ids

IDs of the datasets used as sources for this dataset (if any)

source_datasets: List[DataPortalDataset] View Source

 89    @property
 90    def source_datasets(self) -> List['DataPortalDataset']:
 91        """
 92        Objects representing the datasets used as sources for this dataset (if any)
 93        """
 94        return [
 95            DataPortalDataset(
 96                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
 97                client=self._client
 98            )
 99            for dataset_id in self.source_dataset_ids
100        ]

Objects representing the datasets used as sources for this dataset (if any)

params: cirro_api_client.v1.models.DatasetDetailParams View Source

    @property
    def params(self) -> DatasetDetailParams:
        """
        Parameters used to generate the dataset

        Read from the record returned by `_get_detail()`, which may call the API
        if this object does not already hold a DatasetDetail.
        """
        return self._get_detail().params

Parameters used to generate the dataset

info: cirro_api_client.v1.models.DatasetDetailInfo View Source

    @property
    def info(self) -> DatasetDetailInfo:
        """
        Detailed information about the dataset

        Read from the record returned by `_get_detail()`, which may call the API
        if this object does not already hold a DatasetDetail.
        """
        return self._get_detail().info

Detailed information about the dataset

tags: List[cirro_api_client.v1.models.Tag] View Source

    @property
    def tags(self) -> List[Tag]:
        """
        Tags applied to the dataset (the `tags` field of the underlying API record)
        """
        return self._data.tags

Tags applied to the dataset

created_by: str View Source

    @property
    def created_by(self) -> str:
        """User who created the dataset (the `created_by` field of the underlying API record)"""
        return self._data.created_by

User who created the dataset

created_at: datetime.datetime View Source

    @property
    def created_at(self) -> datetime.datetime:
        """Timestamp of dataset creation (the `created_at` field of the underlying API record)"""
        return self._data.created_at

Timestamp of dataset creation

def list_files(self) -> cirro.sdk.file.DataPortalFiles: View Source

144    def list_files(self) -> DataPortalFiles:
145        """
146        Return the list of files which make up the dataset.
147        """
148        if not self._files:
149            self._files = DataPortalFiles(
150                [
151                    DataPortalFile(file=file, client=self._client)
152                    for file in self._client.datasets.get_file_listing(
153                        project_id=self.project_id,
154                        dataset_id=self.id
155                    )
156                ]
157            )
158        return self._files

Return the list of files which make up the dataset.

def download_files(self, download_location: str = None) -> None: View Source

    def download_files(self, download_location: Optional[str] = None) -> None:
        """
        Download all the files from the dataset to a local directory.

        Args:
            download_location (str): Path to local directory; passed through
                unchanged (including the default of None) to `DataPortalFiles.download`
        """

        # Alias for internal method: list the dataset's files, then download them all
        self.list_files().download(download_location)

Download all the files from the dataset to a local directory.

Arguments:

download_location (str): Path to local directory

def run_analysis( self, name: str = None, description: str = '', process: Union[cirro.sdk.process.DataPortalProcess, str] = None, params=None, notifications_emails=None) -> str: View Source

171    def run_analysis(
172            self,
173            name: str = None,
174            description: str = "",
175            process: Union[DataPortalProcess, str] = None,
176            params=None,
177            notifications_emails=None
178    ) -> str:
179        """
180        Runs an analysis on a dataset, returns the ID of the newly created dataset.
181
182        The process can be provided as either a DataPortalProcess object,
183        or a string which corresponds to the name or ID of the process.
184
185        Args:
186            name (str): Name of newly created dataset
187            description (str): Description of newly created dataset
188            process (DataPortalProcess or str): Process to run
189            params (dict): Analysis parameters
190            notifications_emails (List[str]): Notification email address(es)
191
192        Returns:
193            dataset_id (str): ID of newly created dataset
194        """
195        if name is None:
196            raise DataPortalInputError("Must specify 'name' for run_analysis")
197        if process is None:
198            raise DataPortalInputError("Must specify 'process' for run_analysis")
199        if notifications_emails is None:
200            notifications_emails = []
201        if params is None:
202            params = {}
203
204        # If the process is a string, try to parse it as a process name or ID
205        process = parse_process_name_or_id(process, self._client)
206
207        resp = self._client.execution.run_analysis(
208            project_id=self.project_id,
209            request=RunAnalysisRequest(
210                name=name,
211                description=description,
212                process_id=process.id,
213                source_dataset_ids=[self.id],
214                params=RunAnalysisRequestParams.from_dict(params),
215                notification_emails=notifications_emails
216            )
217        )
218        return resp.id

Runs an analysis on a dataset, returns the ID of the newly created dataset.

The process can be provided as either a DataPortalProcess object, or a string which corresponds to the name or ID of the process.

Arguments:

name (str): Name of newly created dataset
description (str): Description of newly created dataset
process (DataPortalProcess or str): Process to run
params (dict): Analysis parameters
notifications_emails (List[str]): Notification email address(es)

Returns:

dataset_id (str): ID of newly created dataset

class DataPortalDatasets(cirro.sdk.asset.DataPortalAssets[cirro.sdk.dataset.DataPortalDataset]): View Source

class DataPortalDatasets(DataPortalAssets[DataPortalDataset]):
    """Collection of multiple DataPortalDataset objects."""
    # Label for the kind of asset held in this collection.
    # NOTE(review): presumably consumed by DataPortalAssets (e.g. in messages) — confirm
    asset_name = "dataset"

Collection of multiple DataPortalDataset objects.

asset_name = 'dataset'

Inherited Members

cirro.sdk.asset.DataPortalAssets: DataPortalAssets; description; get_by_name; get_by_id; filter_by_pattern
builtins.list: clear; copy; append; insert; extend; pop; remove; index; count; reverse; sort