cirro

 1import cirro.file_utils  # noqa
 2from cirro.cirro_client import CirroApi
 3from cirro.sdk.dataset import DataPortalDataset
 4from cirro.sdk.login import DataPortalLogin
 5from cirro.sdk.portal import DataPortal
 6from cirro.sdk.process import DataPortalProcess
 7from cirro.sdk.project import DataPortalProject
 8from cirro.sdk.reference import DataPortalReference
 9
10__all__ = [
11    'DataPortal',
12    'DataPortalLogin',
13    'DataPortalProject',
14    'DataPortalProcess',
15    'DataPortalDataset',
16    'DataPortalReference',
17    'CirroApi',
18    'file_utils'
19]
class DataPortal:
 13class DataPortal:
 14    """
 15    Helper functions for exploring the Projects, Datasets, Samples, and Files
 16    available in the Data Portal.
 17    """
 18
 19    def __init__(self, base_url: str = None, client: CirroApi = None):
 20        """
 21        Set up the DataPortal object, establishing an authenticated connection.
 22
 23        Args:
 24            base_url (str): Optional base URL of the Cirro instance
 25             (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
 26            client (`cirro.cirro_client.CirroApi`): Optional pre-configured client
 27
 28        Example:
 29        ```python
 30        from cirro import DataPortal
 31
 32        portal = DataPortal(base_url="app.cirro.bio")
 33        portal.list_projects()
 34        ```
 35        """
 36
 37        if client is not None:
 38            self._client = client
 39
 40        # Set up default client if not provided
 41        else:
 42            self._client = CirroApi(base_url=base_url)
 43
 44    def list_projects(self) -> DataPortalProjects:
 45        """List all the projects available in the Data Portal."""
 46
 47        return DataPortalProjects(
 48            [
 49                DataPortalProject(proj, self._client)
 50                for proj in self._client.projects.list()
 51            ]
 52        )
 53
 54    def get_project_by_name(self, name: str = None) -> DataPortalProject:
 55        """Return the project with the specified name."""
 56
 57        return self.list_projects().get_by_name(name)
 58
 59    def get_project_by_id(self, _id: str = None) -> DataPortalProject:
 60        """Return the project with the specified id."""
 61
 62        return self.list_projects().get_by_id(_id)
 63
 64    def get_project(self, project: str = None) -> DataPortalProject:
 65        """
 66        Return a project identified by ID or name.
 67
 68        Args:
 69            project (str): ID or name of project
 70
 71        Returns:
 72            `cirro.sdk.project.DataPortalProject`
 73        """
 74        try:
 75            return self.get_project_by_id(project)
 76        except DataPortalAssetNotFound:
 77            return self.get_project_by_name(project)
 78
 79    def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset:
 80        """
 81        Return a dataset identified by ID or name.
 82
 83        Args:
 84            project (str): ID or name of project
 85            dataset (str): ID or name of dataset
 86
 87        Returns:
 88            `cirro.sdk.dataset.DataPortalDataset`
 89
 90            ```python
 91            from cirro import DataPortal
 92            portal = DataPortal()
 93            dataset = portal.get_dataset(
 94                project="id-or-name-of-project",
 95                dataset="id-or-name-of-dataset"
 96            )
 97            ```
 98        """
 99        try:
100            project: DataPortalProject = self.get_project_by_id(project)
101        except DataPortalAssetNotFound:
102            project: DataPortalProject = self.get_project_by_name(project)
103
104        try:
105            return project.get_dataset_by_id(dataset)
106        except DataPortalAssetNotFound:
107            return project.get_dataset_by_name(dataset)
108
109    def list_processes(self, ingest=False) -> DataPortalProcesses:
110        """
111        List all the processes available in the Data Portal.
112        By default, only list non-ingest processes (those which can be run on existing datasets).
113        To list the processes which can be used to upload datasets, use `ingest = True`.
114
115        Args:
116            ingest (bool): If True, only list those processes which can be used to ingest datasets directly
117        """
118
119        return DataPortalProcesses(
120            [
121                DataPortalProcess(p, self._client)
122                for p in self._client.processes.list()
123                if not ingest or p.executor == Executor.INGEST
124            ]
125        )
126
127    def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess:
128        """
129        Return the process with the specified name.
130
131        Args:
132            name (str): Name of process
133        """
134
135        return self.list_processes(ingest=ingest).get_by_name(name)
136
137    def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess:
138        """
139        Return the process with the specified id
140
141        Args:
142            id (str): ID of process
143        """
144
145        return self.list_processes(ingest=ingest).get_by_id(id)
146
147    def list_reference_types(self) -> DataPortalReferenceTypes:
148        """
149        Return the list of all available reference types
150        """
151
152        return DataPortalReferenceTypes(
153            [
154                DataPortalReferenceType(ref)
155                for ref in self._client.references.get_types()
156            ]
157        )
158
159    @property
160    def developer_helper(self) -> DeveloperHelper:
161        return DeveloperHelper(self._client)

Helper functions for exploring the Projects, Datasets, Samples, and Files available in the Data Portal.

DataPortal(base_url: str = None, client: CirroApi = None)
19    def __init__(self, base_url: str = None, client: CirroApi = None):
20        """
21        Set up the DataPortal object, establishing an authenticated connection.
22
23        Args:
24            base_url (str): Optional base URL of the Cirro instance
25             (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
26            client (`cirro.cirro_client.CirroApi`): Optional pre-configured client
27
28        Example:
29        ```python
30        from cirro import DataPortal
31
32        portal = DataPortal(base_url="app.cirro.bio")
33        portal.list_projects()
34        ```
35        """
36
37        if client is not None:
38            self._client = client
39
40        # Set up default client if not provided
41        else:
42            self._client = CirroApi(base_url=base_url)

Set up the DataPortal object, establishing an authenticated connection.

Arguments:
  • base_url (str): Optional base URL of the Cirro instance (if not provided, it uses the CIRRO_BASE_URL environment variable, or the config file)
  • client (cirro.cirro_client.CirroApi): Optional pre-configured client

Example:

from cirro import DataPortal

portal = DataPortal(base_url="app.cirro.bio")
portal.list_projects()
def list_projects(self) -> cirro.sdk.project.DataPortalProjects:
44    def list_projects(self) -> DataPortalProjects:
45        """List all the projects available in the Data Portal."""
46
47        return DataPortalProjects(
48            [
49                DataPortalProject(proj, self._client)
50                for proj in self._client.projects.list()
51            ]
52        )

List all the projects available in the Data Portal.

def get_project_by_name(self, name: str = None) -> DataPortalProject:
54    def get_project_by_name(self, name: str = None) -> DataPortalProject:
55        """Return the project with the specified name."""
56
57        return self.list_projects().get_by_name(name)

Return the project with the specified name.

def get_project_by_id(self, _id: str = None) -> DataPortalProject:
59    def get_project_by_id(self, _id: str = None) -> DataPortalProject:
60        """Return the project with the specified id."""
61
62        return self.list_projects().get_by_id(_id)

Return the project with the specified id.

def get_project(self, project: str = None) -> DataPortalProject:
64    def get_project(self, project: str = None) -> DataPortalProject:
65        """
66        Return a project identified by ID or name.
67
68        Args:
69            project (str): ID or name of project
70
71        Returns:
72            `cirro.sdk.project.DataPortalProject`
73        """
74        try:
75            return self.get_project_by_id(project)
76        except DataPortalAssetNotFound:
77            return self.get_project_by_name(project)

Return a project identified by ID or name.

Arguments:
  • project (str): ID or name of project
Returns:

cirro.sdk.project.DataPortalProject

def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset:
 79    def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset:
 80        """
 81        Return a dataset identified by ID or name.
 82
 83        Args:
 84            project (str): ID or name of project
 85            dataset (str): ID or name of dataset
 86
 87        Returns:
 88            `cirro.sdk.dataset.DataPortalDataset`
 89
 90            ```python
 91            from cirro import DataPortal
 92            portal = DataPortal()
 93            dataset = portal.get_dataset(
 94                project="id-or-name-of-project",
 95                dataset="id-or-name-of-dataset"
 96            )
 97            ```
 98        """
 99        try:
100            project: DataPortalProject = self.get_project_by_id(project)
101        except DataPortalAssetNotFound:
102            project: DataPortalProject = self.get_project_by_name(project)
103
104        try:
105            return project.get_dataset_by_id(dataset)
106        except DataPortalAssetNotFound:
107            return project.get_dataset_by_name(dataset)

Return a dataset identified by ID or name.

Arguments:
  • project (str): ID or name of project
  • dataset (str): ID or name of dataset
Returns:

cirro.sdk.dataset.DataPortalDataset

from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
    project="id-or-name-of-project",
    dataset="id-or-name-of-dataset"
)
def list_processes(self, ingest=False) -> cirro.sdk.process.DataPortalProcesses:
109    def list_processes(self, ingest=False) -> DataPortalProcesses:
110        """
111        List all the processes available in the Data Portal.
112        By default, only list non-ingest processes (those which can be run on existing datasets).
113        To list the processes which can be used to upload datasets, use `ingest = True`.
114
115        Args:
116            ingest (bool): If True, only list those processes which can be used to ingest datasets directly
117        """
118
119        return DataPortalProcesses(
120            [
121                DataPortalProcess(p, self._client)
122                for p in self._client.processes.list()
123                if not ingest or p.executor == Executor.INGEST
124            ]
125        )

List all the processes available in the Data Portal. By default (ingest = False), no executor filter is applied, so every process is listed. To list only the processes which can be used to upload datasets, use ingest = True.

Arguments:
  • ingest (bool): If True, only list those processes which can be used to ingest datasets directly
def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess:
127    def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess:
128        """
129        Return the process with the specified name.
130
131        Args:
132            name (str): Name of process
133        """
134
135        return self.list_processes(ingest=ingest).get_by_name(name)

Return the process with the specified name.

Arguments:
  • name (str): Name of process
  • ingest (bool): Passed through to list_processes; if True, only ingest processes are searched
def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess:
137    def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess:
138        """
139        Return the process with the specified id
140
141        Args:
142            id (str): ID of process
143        """
144
145        return self.list_processes(ingest=ingest).get_by_id(id)

Return the process with the specified id

Arguments:
  • id (str): ID of process
  • ingest (bool): Passed through to list_processes; if True, only ingest processes are searched
def list_reference_types(self) -> cirro.sdk.reference_type.DataPortalReferenceTypes:
147    def list_reference_types(self) -> DataPortalReferenceTypes:
148        """
149        Return the list of all available reference types
150        """
151
152        return DataPortalReferenceTypes(
153            [
154                DataPortalReferenceType(ref)
155                for ref in self._client.references.get_types()
156            ]
157        )

Return the list of all available reference types

developer_helper: cirro.sdk.developer.DeveloperHelper
159    @property
160    def developer_helper(self) -> DeveloperHelper:
161        return DeveloperHelper(self._client)
class DataPortalLogin:
 8class DataPortalLogin:
 9    """
10    Start the login process, obtaining the authorization message from Cirro
11    needed to confirm the user identity.
12
13    Useful when you need to authenticate a user in a non-blocking way.
14
15    Usage:
16
17    ```python
18    # Replace app.cirro.bio as appropriate
19    login = DataPortalLogin(base_url="app.cirro.bio")
20
21    # Present the user with the authorization message
22    print(login.auth_message)
23
24    # Generate the authenticated DataPortal object,
25    # blocking until the user completes the login process in their browser
26    portal = login.await_completion()
27    ```
28    """
29    base_url: str
30    auth_info: DeviceCodeAuth
31
32    def __init__(self, base_url: str = None, enable_cache=False):
33        app_config = AppConfig(base_url=base_url)
34
35        self.base_url = base_url
36
37        self.auth_info = DeviceCodeAuth(
38            region=app_config.region,
39            client_id=app_config.client_id,
40            auth_endpoint=app_config.auth_endpoint,
41            enable_cache=enable_cache,
42            await_completion=False
43        )
44
45    @property
46    def auth_message(self) -> str:
47        """Authorization message provided by Cirro."""
48        return self.auth_info.auth_message
49
50    @property
51    def auth_message_markdown(self) -> str:
52        """Authorization message provided by Cirro (Markdown format)."""
53        return self.auth_info.auth_message_markdown
54
55    def await_completion(self) -> DataPortal:
56        """Complete the login process and return an authenticated client"""
57
58        # Block until the user completes the login flow
59        self.auth_info.await_completion()
60
61        # Set up the client object
62        cirro_client = CirroApi(
63            auth_info=self.auth_info,
64            base_url=self.base_url
65        )
66
67        # Return the Data Portal object
68        return DataPortal(client=cirro_client)

Start the login process, obtaining the authorization message from Cirro needed to confirm the user identity.

Useful when you need to authenticate a user in a non-blocking way.

Usage:

# Replace app.cirro.bio as appropriate
login = DataPortalLogin(base_url="app.cirro.bio")

# Present the user with the authorization message
print(login.auth_message)

# Generate the authenticated DataPortal object,
# blocking until the user completes the login process in their browser
portal = login.await_completion()
DataPortalLogin(base_url: str = None, enable_cache=False)
32    def __init__(self, base_url: str = None, enable_cache=False):
33        app_config = AppConfig(base_url=base_url)
34
35        self.base_url = base_url
36
37        self.auth_info = DeviceCodeAuth(
38            region=app_config.region,
39            client_id=app_config.client_id,
40            auth_endpoint=app_config.auth_endpoint,
41            enable_cache=enable_cache,
42            await_completion=False
43        )
base_url: str
auth_message: str
45    @property
46    def auth_message(self) -> str:
47        """Authorization message provided by Cirro."""
48        return self.auth_info.auth_message

Authorization message provided by Cirro.

auth_message_markdown: str
50    @property
51    def auth_message_markdown(self) -> str:
52        """Authorization message provided by Cirro (Markdown format)."""
53        return self.auth_info.auth_message_markdown

Authorization message provided by Cirro (Markdown format).

def await_completion(self) -> DataPortal:
55    def await_completion(self) -> DataPortal:
56        """Complete the login process and return an authenticated client"""
57
58        # Block until the user completes the login flow
59        self.auth_info.await_completion()
60
61        # Set up the client object
62        cirro_client = CirroApi(
63            auth_info=self.auth_info,
64            base_url=self.base_url
65        )
66
67        # Return the Data Portal object
68        return DataPortal(client=cirro_client)

Complete the login process and return an authenticated client

class DataPortalProject(cirro.sdk.asset.DataPortalAsset):
 20class DataPortalProject(DataPortalAsset):
 21    """
 22    Projects in the Data Portal contain collections of Datasets.
 23    Users are granted permissions at the project-level, allowing them
 24    to view and/or modify all the datasets in that collection.
 25    """
 26    def __init__(self, proj: Project, client: CirroApi):
 27        """
 28        Instantiate with helper method
 29
 30        ```python
 31        from cirro import DataPortal
 32        portal = DataPortal()
 33        project = portal.get_project_by_name("Project Name")
 34        ```
 35
 36        """
 37        self._data = proj
 38        self._client = client
 39
 40    @property
 41    def id(self) -> str:
 42        """
 43        Unique identifier
 44        """
 45        return self._data.id
 46
 47    @property
 48    def name(self) -> str:
 49        """
 50        Readable name
 51        """
 52        return self._data.name
 53
 54    @property
 55    def description(self) -> str:
 56        """
 57        Longer description of the project
 58        """
 59        return self._data.description
 60
 61    @property
 62    def status(self) -> Status:
 63        """
 64        Status of the project
 65        """
 66        return self._data.status
 67
 68    def __str__(self):
 69        """Control how the Project is rendered as a string."""
 70
 71        return '\n'.join([
 72            f"{i.title()}: {self.__getattribute__(i)}"
 73            for i in ['name', 'id', 'description']
 74        ])
 75
 76    @cache
 77    def _get_datasets(self) -> List[Dataset]:
 78        return list_all_datasets(project_id=self.id,
 79                                 client=self._client)
 80
 81    def list_datasets(self, force_refresh=False) -> DataPortalDatasets:
 82        """List all the datasets available in the project."""
 83        if force_refresh:
 84            self._get_datasets.cache_clear()
 85
 86        return DataPortalDatasets(
 87            [
 88                DataPortalDataset(d, self._client)
 89                for d in self._get_datasets()
 90            ]
 91        )
 92
 93    def get_dataset_by_name(self, name: str, force_refresh=False) -> DataPortalDataset:
 94        """Return the dataset with the specified name."""
 95        if force_refresh:
 96            self._get_datasets.cache_clear()
 97
 98        dataset = next((d for d in self._get_datasets() if d.name == name), None)
 99        if dataset is None:
100            raise DataPortalAssetNotFound(f'Dataset with name {name} not found')
101        return self.get_dataset_by_id(dataset.id)
102
103    def get_dataset_by_id(self, _id: str = None) -> DataPortalDataset:
104        """Return the dataset with the specified id."""
105
106        dataset = self._client.datasets.get(project_id=self.id, dataset_id=_id)
107        if dataset is None:
108            raise DataPortalAssetNotFound(f'Dataset with ID {_id} not found')
109        return DataPortalDataset(dataset, self._client)
110
111    def list_references(self, reference_type: str = None) -> DataPortalReferences:
112        """
113        List the references available in a project.
114        Optionally filter to references of a particular type (identified by name)
115        """
116
117        # Get the complete list of references which are available
118        reference_types = DataPortalReferenceTypes(
119            [
120                DataPortalReferenceType(ref)
121                for ref in self._client.references.get_types()
122            ]
123        )
124
125        # If a particular name was specified
126        if reference_type is not None:
127            reference_types = reference_types.filter_by_pattern(reference_type)
128            if len(reference_types) == 0:
129                msg = f"Could not find any reference types with the name {reference_type}"
130                raise DataPortalAssetNotFound(msg)
131
132        return DataPortalReferences(
133            [
134                DataPortalReference(ref, project_id=self.id, client=self._client)
135                for ref in self._client.references.get_for_project(
136                    self.id
137                )
138                if reference_type is None or ref.type == reference_type
139            ]
140        )
141
142    def get_reference_by_name(self, name: str = None, ref_type: str = None) -> DataPortalReference:
143        """Return the reference of a particular type with the specified name."""
144
145        if name is None:
146            raise DataPortalInputError("Must specify the reference name")
147
148        return self.list_references(ref_type).get_by_name(name)
149
150    def upload_dataset(
151        self,
152        name: str = None,
153        description='',
154        process: Union[DataPortalProcess, str] = None,
155        upload_folder: str = None,
156        files: List[str] = None,
157        tags: List[str] = None,
158    ):
159        """
160        Upload a set of files to the Data Portal, creating a new dataset.
161
162        If the files parameter is not provided, it will upload all files in the upload folder
163
164        Args:
165            name (str): Name of newly created dataset
166            description (str): Description of newly created dataset
167            process (str | DataPortalProcess): Process to run may be referenced by name, ID, or object
168            upload_folder (str): Folder containing files to upload
169            files (List[str]): Optional subset of files to upload from the folder
170            tags (List[str]): Optional list of tags to apply to the dataset
171        """
172
173        if name is None:
174            raise DataPortalInputError("Must provide name for new dataset")
175        if process is None:
176            raise DataPortalInputError("Must provide the process which is used for ingest")
177        if upload_folder is None:
178            raise DataPortalInputError("Must provide upload_folder -- folder containing files to upload")
179
180        # Parse the process provided by the user
181        process = parse_process_name_or_id(process, self._client)
182
183        # If no files were provided
184        if files is None:
185            # Get the list of files in the upload folder
186            files = get_files_in_directory(upload_folder)
187
188        if files is None or len(files) == 0:
189            raise RuntimeWarning("No files to upload, exiting")
190
191        # Normalize into Tag object
192        if tags is not None:
193            tags = [Tag(value=value) for value in tags]
194
195        # Make sure that the files match the expected pattern
196        self._client.processes.check_dataset_files(files, process.id, upload_folder)
197
198        # Create the ingest process request
199        dataset_create_request = UploadDatasetRequest(
200            process_id=process.id,
201            name=name,
202            description=description,
203            expected_files=files,
204            tags=tags,
205        )
206
207        # Get the response
208        create_response = self._client.datasets.create(project_id=self.id,
209                                                       upload_request=dataset_create_request)
210
211        # Upload the files
212        self._client.datasets.upload_files(
213            project_id=self.id,
214            dataset_id=create_response.id,
215            directory=upload_folder,
216            files=files
217        )
218
219        # Return the dataset which was created, which might take a second to update
220        max_attempts = 5
221        for attempt in range(max_attempts):
222            try:
223                return self.get_dataset_by_id(create_response.id)
224            except DataPortalAssetNotFound as e:
225                if attempt == max_attempts - 1:
226                    raise e
227                else:
228                    sleep(2)
229
230    def samples(self, max_items: int = 10000) -> List[Sample]:
231        """
232        Retrieves a list of samples associated with a project along with their metadata
233
234        Args:
235            max_items (int): Maximum number of records to get (default 10,000)
236        """
237        return self._client.metadata.get_project_samples(self.id, max_items)

Projects in the Data Portal contain collections of Datasets. Users are granted permissions at the project-level, allowing them to view and/or modify all the datasets in that collection.

DataPortalProject(proj: cirro_api_client.v1.models.Project, client: CirroApi)
26    def __init__(self, proj: Project, client: CirroApi):
27        """
28        Instantiate with helper method
29
30        ```python
31        from cirro import DataPortal
32        portal = DataPortal()
33        project = portal.get_project_by_name("Project Name")
34        ```
35
36        """
37        self._data = proj
38        self._client = client

Instantiate with a helper method on DataPortal, for example:

from cirro import DataPortal
portal = DataPortal()
project = portal.get_project_by_name("Project Name")
id: str
40    @property
41    def id(self) -> str:
42        """
43        Unique identifier
44        """
45        return self._data.id

Unique identifier

name: str
47    @property
48    def name(self) -> str:
49        """
50        Readable name
51        """
52        return self._data.name

Readable name

description: str
54    @property
55    def description(self) -> str:
56        """
57        Longer description of the project
58        """
59        return self._data.description

Longer description of the project

status: cirro_api_client.v1.models.Status
61    @property
62    def status(self) -> Status:
63        """
64        Status of the project
65        """
66        return self._data.status

Status of the project

def list_datasets(self, force_refresh=False) -> cirro.sdk.dataset.DataPortalDatasets:
81    def list_datasets(self, force_refresh=False) -> DataPortalDatasets:
82        """List all the datasets available in the project."""
83        if force_refresh:
84            self._get_datasets.cache_clear()
85
86        return DataPortalDatasets(
87            [
88                DataPortalDataset(d, self._client)
89                for d in self._get_datasets()
90            ]
91        )

List all the datasets available in the project.

def get_dataset_by_name(self, name: str, force_refresh=False) -> DataPortalDataset:
 93    def get_dataset_by_name(self, name: str, force_refresh=False) -> DataPortalDataset:
 94        """Return the dataset with the specified name."""
 95        if force_refresh:
 96            self._get_datasets.cache_clear()
 97
 98        dataset = next((d for d in self._get_datasets() if d.name == name), None)
 99        if dataset is None:
100            raise DataPortalAssetNotFound(f'Dataset with name {name} not found')
101        return self.get_dataset_by_id(dataset.id)

Return the dataset with the specified name.

def get_dataset_by_id(self, _id: str = None) -> DataPortalDataset:
103    def get_dataset_by_id(self, _id: str = None) -> DataPortalDataset:
104        """Return the dataset with the specified id."""
105
106        dataset = self._client.datasets.get(project_id=self.id, dataset_id=_id)
107        if dataset is None:
108            raise DataPortalAssetNotFound(f'Dataset with ID {_id} not found')
109        return DataPortalDataset(dataset, self._client)

Return the dataset with the specified id.

def list_references(self, reference_type: str = None) -> cirro.sdk.reference.DataPortalReferences:
111    def list_references(self, reference_type: str = None) -> DataPortalReferences:
112        """
113        List the references available in a project.
114        Optionally filter to references of a particular type (identified by name)
115        """
116
117        # Get the complete list of references which are available
118        reference_types = DataPortalReferenceTypes(
119            [
120                DataPortalReferenceType(ref)
121                for ref in self._client.references.get_types()
122            ]
123        )
124
125        # If a particular name was specified
126        if reference_type is not None:
127            reference_types = reference_types.filter_by_pattern(reference_type)
128            if len(reference_types) == 0:
129                msg = f"Could not find any reference types with the name {reference_type}"
130                raise DataPortalAssetNotFound(msg)
131
132        return DataPortalReferences(
133            [
134                DataPortalReference(ref, project_id=self.id, client=self._client)
135                for ref in self._client.references.get_for_project(
136                    self.id
137                )
138                if reference_type is None or ref.type == reference_type
139            ]
140        )

List the references available in a project. Optionally filter to references of a particular type (identified by name)

def get_reference_by_name(self, name: str = None, ref_type: str = None) -> DataPortalReference:
142    def get_reference_by_name(self, name: str = None, ref_type: str = None) -> DataPortalReference:
143        """Return the reference of a particular type with the specified name."""
144
145        if name is None:
146            raise DataPortalInputError("Must specify the reference name")
147
148        return self.list_references(ref_type).get_by_name(name)

Return the reference of a particular type with the specified name.

def upload_dataset( self, name: str = None, description='', process: Union[DataPortalProcess, str] = None, upload_folder: str = None, files: List[str] = None, tags: List[str] = None):
150    def upload_dataset(
151        self,
152        name: str = None,
153        description='',
154        process: Union[DataPortalProcess, str] = None,
155        upload_folder: str = None,
156        files: List[str] = None,
157        tags: List[str] = None,
158    ):
159        """
160        Upload a set of files to the Data Portal, creating a new dataset.
161
162        If the files parameter is not provided, it will upload all files in the upload folder
163
164        Args:
165            name (str): Name of newly created dataset
166            description (str): Description of newly created dataset
167            process (str | DataPortalProcess): Process to run may be referenced by name, ID, or object
168            upload_folder (str): Folder containing files to upload
169            files (List[str]): Optional subset of files to upload from the folder
170            tags (List[str]): Optional list of tags to apply to the dataset
171        """
172
173        if name is None:
174            raise DataPortalInputError("Must provide name for new dataset")
175        if process is None:
176            raise DataPortalInputError("Must provide the process which is used for ingest")
177        if upload_folder is None:
178            raise DataPortalInputError("Must provide upload_folder -- folder containing files to upload")
179
180        # Parse the process provided by the user
181        process = parse_process_name_or_id(process, self._client)
182
183        # If no files were provided
184        if files is None:
185            # Get the list of files in the upload folder
186            files = get_files_in_directory(upload_folder)
187
188        if files is None or len(files) == 0:
189            raise RuntimeWarning("No files to upload, exiting")
190
191        # Normalize into Tag object
192        if tags is not None:
193            tags = [Tag(value=value) for value in tags]
194
195        # Make sure that the files match the expected pattern
196        self._client.processes.check_dataset_files(files, process.id, upload_folder)
197
198        # Create the ingest process request
199        dataset_create_request = UploadDatasetRequest(
200            process_id=process.id,
201            name=name,
202            description=description,
203            expected_files=files,
204            tags=tags,
205        )
206
207        # Get the response
208        create_response = self._client.datasets.create(project_id=self.id,
209                                                       upload_request=dataset_create_request)
210
211        # Upload the files
212        self._client.datasets.upload_files(
213            project_id=self.id,
214            dataset_id=create_response.id,
215            directory=upload_folder,
216            files=files
217        )
218
219        # Return the dataset which was created, which might take a second to update
220        max_attempts = 5
221        for attempt in range(max_attempts):
222            try:
223                return self.get_dataset_by_id(create_response.id)
224            except DataPortalAssetNotFound as e:
225                if attempt == max_attempts - 1:
226                    raise e
227                else:
228                    sleep(2)

Upload a set of files to the Data Portal, creating a new dataset.

If the files parameter is not provided, it will upload all files in the upload folder

Arguments:
  • name (str): Name of newly created dataset
  • description (str): Description of newly created dataset
  • process (str | DataPortalProcess): Process to run may be referenced by name, ID, or object
  • upload_folder (str): Folder containing files to upload
  • files (List[str]): Optional subset of files to upload from the folder
  • tags (List[str]): Optional list of tags to apply to the dataset
def samples( self, max_items: int = 10000) -> List[cirro_api_client.v1.models.Sample]:
230    def samples(self, max_items: int = 10000) -> List[Sample]:
231        """
232        Retrieves a list of samples associated with a project along with their metadata
233
234        Args:
235            max_items (int): Maximum number of records to get (default 10,000)
236        """
237        return self._client.metadata.get_project_samples(self.id, max_items)

Retrieves a list of samples associated with a project along with their metadata

Arguments:
  • max_items (int): Maximum number of records to get (default 10,000)
class DataPortalProcess(cirro.sdk.asset.DataPortalAsset):
class DataPortalProcess(DataPortalAsset):
    """Helper functions for interacting with analysis processes."""

    def __init__(self, process: Union[Process, ProcessDetail], client: CirroApi):
        """
        Instantiate with helper method

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        process = portal.get_process_by_name("Process Name")
        ```

        Args:
            process (Process | ProcessDetail): Process record to wrap
            client (`cirro.cirro_client.CirroApi`): Client used for follow-up API calls
        """
        self._data = process
        self._client = client

    @property
    def id(self) -> str:
        """Unique identifier"""
        return self._data.id

    @property
    def name(self) -> str:
        """Readable name"""
        return self._data.name

    @property
    def description(self) -> str:
        """Longer description of process"""
        return self._data.description

    @property
    def child_process_ids(self) -> List[str]:
        """List of processes which can be run on the output of this process"""
        return self._data.child_process_ids

    @property
    def executor(self) -> Executor:
        """INGEST, CROMWELL, or NEXTFLOW"""
        return self._data.executor

    @property
    def category(self) -> str:
        """Category of process"""
        return self._data.category

    @property
    def pipeline_type(self) -> str:
        """Pipeline type"""
        return self._data.pipeline_type

    @property
    def documentation_url(self) -> str:
        """Documentation URL"""
        return self._data.documentation_url

    @property
    def file_requirements_message(self) -> str:
        """Description of files required for INGEST processes"""
        return self._data.file_requirements_message

    @property
    def code(self) -> PipelineCode:
        """Pipeline code configuration"""
        return self._get_detail().pipeline_code

    @property
    def custom_settings(self) -> CustomPipelineSettings:
        """Custom settings for the process"""
        return self._get_detail().custom_settings

    def _get_detail(self) -> ProcessDetail:
        """
        Return the detailed process record.

        If the wrapped record is not already a `ProcessDetail`, it is fetched
        through the client and kept, so later calls reuse it.
        """
        if not isinstance(self._data, ProcessDetail):
            self._data = self._client.processes.get(self.id)
        return self._data

    def __str__(self):
        """Summarize the process as one 'Field: value' line per headline attribute."""
        return '\n'.join([
            f"{i.title()}: {getattr(self, i)}"
            for i in ['name', 'id', 'description']
        ])

    def get_parameter_spec(self) -> ParameterSpecification:
        """
        Gets a specification used to describe the parameters used in the process.
        """
        return self._client.processes.get_parameter_spec(self.id)

Helper functions for interacting with analysis processes.

DataPortalProcess( process: Union[cirro_api_client.v1.models.Process, cirro_api_client.v1.models.ProcessDetail], client: CirroApi)
14    def __init__(self, process: Union[Process, ProcessDetail], client: CirroApi):
15        """
16        Instantiate with helper method
17
18        ```python
19        from cirro import DataPortal()
20        portal = DataPortal()
21        process = portal.get_process_by_name("Process Name")
22        ```
23        """
24        self._data = process
25        self._client = client

Instantiate with helper method

from cirro import DataPortal
portal = DataPortal()
process = portal.get_process_by_name("Process Name")
id: str
27    @property
28    def id(self) -> str:
29        """Unique identifier"""
30        return self._data.id

Unique identifier

name: str
32    @property
33    def name(self) -> str:
34        """Readable name"""
35        return self._data.name

Readable name

description: str
37    @property
38    def description(self) -> str:
39        """Longer description of process"""
40        return self._data.description

Longer description of process

child_process_ids: List[str]
42    @property
43    def child_process_ids(self) -> List[str]:
44        """List of processes which can be run on the output of this process"""
45        return self._data.child_process_ids

List of processes which can be run on the output of this process

executor: cirro_api_client.v1.models.Executor
47    @property
48    def executor(self) -> Executor:
49        """INGEST, CROMWELL, or NEXTFLOW"""
50        return self._data.executor

INGEST, CROMWELL, or NEXTFLOW

category: str
52    @property
53    def category(self) -> str:
54        """Category of process"""
55        return self._data.category

Category of process

pipeline_type: str
57    @property
58    def pipeline_type(self) -> str:
59        """Pipeline type"""
60        return self._data.pipeline_type

Pipeline type

documentation_url: str
62    @property
63    def documentation_url(self) -> str:
64        """Documentation URL"""
65        return self._data.documentation_url

Documentation URL

file_requirements_message: str
67    @property
68    def file_requirements_message(self) -> str:
69        """Description of files required for INGEST processes"""
70        return self._data.file_requirements_message

Description of files required for INGEST processes

72    @property
73    def code(self) -> PipelineCode:
74        """Pipeline code configuration"""
75        return self._get_detail().pipeline_code

Pipeline code configuration

77    @property
78    def custom_settings(self) -> CustomPipelineSettings:
79        """Custom settings for the process"""
80        return self._get_detail().custom_settings

Custom settings for the process

def get_parameter_spec(self) -> cirro.models.form_specification.ParameterSpecification:
93    def get_parameter_spec(self) -> ParameterSpecification:
94        """
95        Gets a specification used to describe the parameters used in the process.
96        """
97        return self._client.processes.get_parameter_spec(self.id)

Gets a specification used to describe the parameters used in the process.

class DataPortalDataset(cirro.sdk.asset.DataPortalAsset):
 21class DataPortalDataset(DataPortalAsset):
 22    """
 23    Datasets in the Data Portal are collections of files which have
 24    either been uploaded directly, or which have been output by
 25    an analysis pipeline or notebook.
 26    """
 27
 28    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
 29        """
 30        Instantiate a dataset object
 31
 32        Should be invoked from a top-level constructor, for example:
 33
 34        ```python
 35        from cirro import DataPortal()
 36        portal = DataPortal()
 37        dataset = portal.get_dataset(
 38            project="id-or-name-of-project",
 39            dataset="id-or-name-of-dataset"
 40        )
 41        ```
 42
 43        """
 44        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
 45        self._data = dataset
 46        self._assets: Optional[DatasetAssets] = None
 47        self._client = client
 48
 49    @property
 50    def id(self) -> str:
 51        """Unique identifier for the dataset"""
 52        return self._data.id
 53
 54    @property
 55    def name(self) -> str:
 56        """Editable name for the dataset"""
 57        return self._data.name
 58
 59    @property
 60    def description(self) -> str:
 61        """Longer name for the dataset"""
 62        return self._data.description
 63
 64    @property
 65    def process_id(self) -> str:
 66        """Unique ID of process used to create the dataset"""
 67        return self._data.process_id
 68
 69    @property
 70    def process(self) -> ProcessDetail:
 71        """
 72        Object representing the process used to create the dataset
 73        """
 74        return self._client.processes.get(self.process_id)
 75
 76    @property
 77    def project_id(self) -> str:
 78        """ID of the project containing the dataset"""
 79        return self._data.project_id
 80
 81    @property
 82    def status(self) -> Status:
 83        """
 84        Status of the dataset
 85        """
 86        return self._data.status
 87
 88    @property
 89    def source_dataset_ids(self) -> List[str]:
 90        """IDs of the datasets used as sources for this dataset (if any)"""
 91        return self._data.source_dataset_ids
 92
 93    @property
 94    def source_datasets(self) -> List['DataPortalDataset']:
 95        """
 96        Objects representing the datasets used as sources for this dataset (if any)
 97        """
 98        return [
 99            DataPortalDataset(
100                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
101                client=self._client
102            )
103            for dataset_id in self.source_dataset_ids
104        ]
105
106    @property
107    def params(self) -> dict:
108        """
109        Parameters used to generate the dataset
110        """
111        return self._get_detail().params.to_dict()
112
113    @property
114    def info(self) -> dict:
115        """
116        Extra information about the dataset
117        """
118        return self._get_detail().info.to_dict()
119
120    @property
121    def tags(self) -> List[Tag]:
122        """
123        Tags applied to the dataset
124        """
125        return self._data.tags
126
127    @property
128    def share(self) -> Optional[NamedItem]:
129        """
130        Share associated with the dataset, if any.
131        """
132        return self._get_detail().share
133
134    @property
135    def created_by(self) -> str:
136        """User who created the dataset"""
137        return self._data.created_by
138
139    @property
140    def created_at(self) -> datetime.datetime:
141        """Timestamp of dataset creation"""
142        return self._data.created_at
143
144    def _get_detail(self):
145        if not isinstance(self._data, DatasetDetail):
146            self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id)
147        return self._data
148
149    def _get_assets(self):
150        if not self._assets:
151            self._assets = self._client.datasets.get_assets_listing(
152                project_id=self.project_id,
153                dataset_id=self.id
154            )
155        return self._assets
156
157    def __str__(self):
158        return '\n'.join([
159            f"{i.title()}: {self.__getattribute__(i)}"
160            for i in ['name', 'id', 'description', 'status']
161        ])
162
163    def get_file(self, relative_path: str) -> DataPortalFile:
164        """
165        Get a file from the dataset using its relative path.
166
167        Args:
168            relative_path (str): Relative path of file within the dataset
169
170        Returns:
171            `from cirro.sdk.file import DataPortalFile`
172        """
173
174        # Get the list of files in this dataset
175        files = self.list_files()
176
177        # Try getting the file using the relative path provided by the user
178        try:
179            return files.get_by_id(relative_path)
180        except DataPortalAssetNotFound:
181            # Try getting the file with the 'data/' prefix prepended
182            try:
183                return files.get_by_id("data/" + relative_path)
184            except DataPortalAssetNotFound:
185                # If not found, raise the exception using the string provided
186                # by the user, not the data/ prepended version (which may be
187                # confusing to the user)
188                msg = '\n'.join([f"No file found with path '{relative_path}'."])
189                raise DataPortalAssetNotFound(msg)
190
191    def list_files(self) -> DataPortalFiles:
192        """
193        Return the list of files which make up the dataset.
194        """
195        files = self._get_assets().files
196        return DataPortalFiles(
197            [
198                DataPortalFile(file=file, client=self._client)
199                for file in files
200            ]
201        )
202
203    def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
204        """
205        Get the artifact of a particular type from the dataset
206        """
207        artifacts = self._get_assets().artifacts
208        artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None)
209        if artifact is None:
210            raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'")
211        return DataPortalFile(file=artifact.file, client=self._client)
212
213    def list_artifacts(self) -> List[DataPortalFile]:
214        """
215        Return the list of artifacts associated with the dataset
216
217        An artifact may be something generated as part of the analysis or other process.
218        See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types.
219
220        """
221        artifacts = self._get_assets().artifacts
222        return DataPortalFiles(
223            [
224                DataPortalFile(file=artifact.file, client=self._client)
225                for artifact in artifacts
226            ]
227        )
228
229    def download_files(self, download_location: str = None) -> None:
230        """
231        Download all the files from the dataset to a local directory.
232
233        Args:
234            download_location (str): Path to local directory
235        """
236
237        # Alias for internal method
238        self.list_files().download(download_location)
239
240    def run_analysis(
241            self,
242            name: str = None,
243            description: str = "",
244            process: Union[DataPortalProcess, str] = None,
245            params=None,
246            notifications_emails: List[str] = None,
247            compute_environment: str = None,
248            resume_dataset_id: str = None
249    ) -> str:
250        """
251        Runs an analysis on a dataset, returns the ID of the newly created dataset.
252
253        The process can be provided as either a DataPortalProcess object,
254        or a string which corresponds to the name or ID of the process.
255
256        Args:
257            name (str): Name of newly created dataset
258            description (str): Description of newly created dataset
259            process (DataPortalProcess or str): Process to run
260            params (dict): Analysis parameters
261            notifications_emails (List[str]): Notification email address(es)
262            compute_environment (str): Name or ID of compute environment to use,
263             if blank it will run in AWS
264            resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
265             It will attempt to re-use the previous output to minimize duplicate work
266
267        Returns:
268            dataset_id (str): ID of newly created dataset
269        """
270        if name is None:
271            raise DataPortalInputError("Must specify 'name' for run_analysis")
272        if process is None:
273            raise DataPortalInputError("Must specify 'process' for run_analysis")
274        if notifications_emails is None:
275            notifications_emails = []
276        if params is None:
277            params = {}
278
279        # If the process is a string, try to parse it as a process name or ID
280        process = parse_process_name_or_id(process, self._client)
281
282        if compute_environment:
283            compute_environments = self._client.compute_environments.list_environments_for_project(
284                project_id=self.project_id
285            )
286            compute_environment = next(
287                (env for env in compute_environments
288                 if env.name == compute_environment or env.id == compute_environment),
289                None
290            )
291            if compute_environment is None:
292                raise DataPortalInputError(f"Compute environment '{compute_environment}' not found")
293
294        resp = self._client.execution.run_analysis(
295            project_id=self.project_id,
296            request=RunAnalysisRequest(
297                name=name,
298                description=description,
299                process_id=process.id,
300                source_dataset_ids=[self.id],
301                params=RunAnalysisRequestParams.from_dict(params),
302                notification_emails=notifications_emails,
303                resume_dataset_id=resume_dataset_id,
304                compute_environment_id=compute_environment.id if compute_environment else None
305            )
306        )
307        return resp.id
308
309    def update_samplesheet(self,
310                           contents: str = None,
311                           file_path: PathLike = None):
312        """
313        Updates the samplesheet metadata of a dataset.
314        Provide either the contents (as a string) or a file path.
315        Both must be in the format of a CSV.
316
317        Args:
318            contents (str): Samplesheet contents to update (should be a CSV string)
319            file_path (PathLike): Path of file to update (should be a CSV file)
320
321        Example:
322        ```python
323        dataset.update_samplesheet(
324            file_path=Path('~/samplesheet.csv')
325        )
326        ```
327        """
328
329        if contents is None and file_path is None:
330            raise DataPortalInputError("Must specify either 'contents' or 'file_path' when updating samplesheet")
331
332        if self.process.executor != Executor.INGEST:
333            raise DataPortalInputError("Cannot update a samplesheet on a non-ingest dataset")
334
335        samplesheet_contents = contents
336        if file_path is not None:
337            samplesheet_contents = Path(file_path).expanduser().read_text()
338
339        # Validate samplesheet
340        file_names = [f.file_name for f in self.list_files()]
341        request = ValidateFileRequirementsRequest(
342            file_names=file_names,
343            sample_sheet=samplesheet_contents,
344        )
345        requirements = validate_file_requirements.sync(process_id=self.process_id,
346                                                       body=request,
347                                                       client=self._client.api_client)
348        if error_msg := requirements.error_msg:
349            raise DataPortalInputError(error_msg)
350
351        # Update the samplesheet if everything looks ok
352        self._client.datasets.update_samplesheet(
353            project_id=self.project_id,
354            dataset_id=self.id,
355            samplesheet=samplesheet_contents
356        )

Datasets in the Data Portal are collections of files which have either been uploaded directly, or which have been output by an analysis pipeline or notebook.

DataPortalDataset( dataset: Union[cirro_api_client.v1.models.Dataset, cirro_api_client.v1.models.DatasetDetail], client: CirroApi)
28    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
29        """
30        Instantiate a dataset object
31
32        Should be invoked from a top-level constructor, for example:
33
34        ```python
35        from cirro import DataPortal()
36        portal = DataPortal()
37        dataset = portal.get_dataset(
38            project="id-or-name-of-project",
39            dataset="id-or-name-of-dataset"
40        )
41        ```
42
43        """
44        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
45        self._data = dataset
46        self._assets: Optional[DatasetAssets] = None
47        self._client = client

Instantiate a dataset object

Should be invoked from a top-level constructor, for example:

from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
    project="id-or-name-of-project",
    dataset="id-or-name-of-dataset"
)
id: str
49    @property
50    def id(self) -> str:
51        """Unique identifier for the dataset"""
52        return self._data.id

Unique identifier for the dataset

name: str
54    @property
55    def name(self) -> str:
56        """Editable name for the dataset"""
57        return self._data.name

Editable name for the dataset

description: str
59    @property
60    def description(self) -> str:
61        """Longer name for the dataset"""
62        return self._data.description

Longer name for the dataset

process_id: str
64    @property
65    def process_id(self) -> str:
66        """Unique ID of process used to create the dataset"""
67        return self._data.process_id

Unique ID of process used to create the dataset

69    @property
70    def process(self) -> ProcessDetail:
71        """
72        Object representing the process used to create the dataset
73        """
74        return self._client.processes.get(self.process_id)

Object representing the process used to create the dataset

project_id: str
76    @property
77    def project_id(self) -> str:
78        """ID of the project containing the dataset"""
79        return self._data.project_id

ID of the project containing the dataset

status: cirro_api_client.v1.models.Status
81    @property
82    def status(self) -> Status:
83        """
84        Status of the dataset
85        """
86        return self._data.status

Status of the dataset

source_dataset_ids: List[str]
88    @property
89    def source_dataset_ids(self) -> List[str]:
90        """IDs of the datasets used as sources for this dataset (if any)"""
91        return self._data.source_dataset_ids

IDs of the datasets used as sources for this dataset (if any)

source_datasets: List[DataPortalDataset]
 93    @property
 94    def source_datasets(self) -> List['DataPortalDataset']:
 95        """
 96        Objects representing the datasets used as sources for this dataset (if any)
 97        """
 98        return [
 99            DataPortalDataset(
100                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
101                client=self._client
102            )
103            for dataset_id in self.source_dataset_ids
104        ]

Objects representing the datasets used as sources for this dataset (if any)

params: dict
106    @property
107    def params(self) -> dict:
108        """
109        Parameters used to generate the dataset
110        """
111        return self._get_detail().params.to_dict()

Parameters used to generate the dataset

info: dict
113    @property
114    def info(self) -> dict:
115        """
116        Extra information about the dataset
117        """
118        return self._get_detail().info.to_dict()

Extra information about the dataset

tags: List[cirro_api_client.v1.models.Tag]
120    @property
121    def tags(self) -> List[Tag]:
122        """
123        Tags applied to the dataset
124        """
125        return self._data.tags

Tags applied to the dataset

share: Optional[cirro_api_client.v1.models.NamedItem]
127    @property
128    def share(self) -> Optional[NamedItem]:
129        """
130        Share associated with the dataset, if any.
131        """
132        return self._get_detail().share

Share associated with the dataset, if any.

created_by: str
134    @property
135    def created_by(self) -> str:
136        """User who created the dataset"""
137        return self._data.created_by

User who created the dataset

created_at: datetime.datetime
139    @property
140    def created_at(self) -> datetime.datetime:
141        """Timestamp of dataset creation"""
142        return self._data.created_at

Timestamp of dataset creation

def get_file(self, relative_path: str) -> cirro.sdk.file.DataPortalFile:
163    def get_file(self, relative_path: str) -> DataPortalFile:
164        """
165        Get a file from the dataset using its relative path.
166
167        Args:
168            relative_path (str): Relative path of file within the dataset
169
170        Returns:
171            `from cirro.sdk.file import DataPortalFile`
172        """
173
174        # Get the list of files in this dataset
175        files = self.list_files()
176
177        # Try getting the file using the relative path provided by the user
178        try:
179            return files.get_by_id(relative_path)
180        except DataPortalAssetNotFound:
181            # Try getting the file with the 'data/' prefix prepended
182            try:
183                return files.get_by_id("data/" + relative_path)
184            except DataPortalAssetNotFound:
185                # If not found, raise the exception using the string provided
186                # by the user, not the data/ prepended version (which may be
187                # confusing to the user)
188                msg = '\n'.join([f"No file found with path '{relative_path}'."])
189                raise DataPortalAssetNotFound(msg)

Get a file from the dataset using its relative path.

Arguments:
  • relative_path (str): Relative path of file within the dataset
Returns:

cirro.sdk.file.DataPortalFile

def list_files(self) -> cirro.sdk.file.DataPortalFiles:
191    def list_files(self) -> DataPortalFiles:
192        """
193        Return the list of files which make up the dataset.
194        """
195        files = self._get_assets().files
196        return DataPortalFiles(
197            [
198                DataPortalFile(file=file, client=self._client)
199                for file in files
200            ]
201        )

Return the list of files which make up the dataset.

def get_artifact( self, artifact_type: cirro_api_client.v1.models.ArtifactType) -> cirro.sdk.file.DataPortalFile:
203    def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
204        """
205        Get the artifact of a particular type from the dataset
206        """
207        artifacts = self._get_assets().artifacts
208        artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None)
209        if artifact is None:
210            raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'")
211        return DataPortalFile(file=artifact.file, client=self._client)

Get the artifact of a particular type from the dataset

def list_artifacts(self) -> List[cirro.sdk.file.DataPortalFile]:
213    def list_artifacts(self) -> List[DataPortalFile]:
214        """
215        Return the list of artifacts associated with the dataset
216
217        An artifact may be something generated as part of the analysis or other process.
218        See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types.
219
220        """
221        artifacts = self._get_assets().artifacts
222        return DataPortalFiles(
223            [
224                DataPortalFile(file=artifact.file, client=self._client)
225                for artifact in artifacts
226            ]
227        )

Return the list of artifacts associated with the dataset

An artifact may be something generated as part of the analysis or other process. See cirro_api_client.v1.models.ArtifactType for the list of possible artifact types.

def download_files(self, download_location: str = None) -> None:
229    def download_files(self, download_location: str = None) -> None:
230        """
231        Download all the files from the dataset to a local directory.
232
233        Args:
234            download_location (str): Path to local directory
235        """
236
237        # Alias for internal method
238        self.list_files().download(download_location)

Download all the files from the dataset to a local directory.

Arguments:
  • download_location (str): Path to local directory
def run_analysis( self, name: str = None, description: str = '', process: Union[DataPortalProcess, str] = None, params=None, notifications_emails: List[str] = None, compute_environment: str = None, resume_dataset_id: str = None) -> str:
240    def run_analysis(
241            self,
242            name: str = None,
243            description: str = "",
244            process: Union[DataPortalProcess, str] = None,
245            params=None,
246            notifications_emails: List[str] = None,
247            compute_environment: str = None,
248            resume_dataset_id: str = None
249    ) -> str:
250        """
251        Runs an analysis on a dataset, returns the ID of the newly created dataset.
252
253        The process can be provided as either a DataPortalProcess object,
254        or a string which corresponds to the name or ID of the process.
255
256        Args:
257            name (str): Name of newly created dataset
258            description (str): Description of newly created dataset
259            process (DataPortalProcess or str): Process to run
260            params (dict): Analysis parameters
261            notifications_emails (List[str]): Notification email address(es)
262            compute_environment (str): Name or ID of compute environment to use,
263             if blank it will run in AWS
264            resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
265             It will attempt to re-use the previous output to minimize duplicate work
266
267        Returns:
268            dataset_id (str): ID of newly created dataset
269        """
270        if name is None:
271            raise DataPortalInputError("Must specify 'name' for run_analysis")
272        if process is None:
273            raise DataPortalInputError("Must specify 'process' for run_analysis")
274        if notifications_emails is None:
275            notifications_emails = []
276        if params is None:
277            params = {}
278
279        # If the process is a string, try to parse it as a process name or ID
280        process = parse_process_name_or_id(process, self._client)
281
282        if compute_environment:
283            compute_environments = self._client.compute_environments.list_environments_for_project(
284                project_id=self.project_id
285            )
286            compute_environment = next(
287                (env for env in compute_environments
288                 if env.name == compute_environment or env.id == compute_environment),
289                None
290            )
291            if compute_environment is None:
292                raise DataPortalInputError(f"Compute environment '{compute_environment}' not found")
293
294        resp = self._client.execution.run_analysis(
295            project_id=self.project_id,
296            request=RunAnalysisRequest(
297                name=name,
298                description=description,
299                process_id=process.id,
300                source_dataset_ids=[self.id],
301                params=RunAnalysisRequestParams.from_dict(params),
302                notification_emails=notifications_emails,
303                resume_dataset_id=resume_dataset_id,
304                compute_environment_id=compute_environment.id if compute_environment else None
305            )
306        )
307        return resp.id

Runs an analysis on a dataset, returns the ID of the newly created dataset.

The process can be provided as either a DataPortalProcess object, or a string which corresponds to the name or ID of the process.

Arguments:
  • name (str): Name of newly created dataset
  • description (str): Description of newly created dataset
  • process (DataPortalProcess or str): Process to run
  • params (dict): Analysis parameters
  • notifications_emails (List[str]): Notification email address(es)
  • compute_environment (str): Name or ID of compute environment to use, if blank it will run in AWS
  • resume_dataset_id (str): ID of dataset to resume from, used for caching task execution. It will attempt to re-use the previous output to minimize duplicate work
Returns:

dataset_id (str): ID of newly created dataset

def update_samplesheet(self, contents: str = None, file_path: ~PathLike = None):
309    def update_samplesheet(self,
310                           contents: str = None,
311                           file_path: PathLike = None):
312        """
313        Updates the samplesheet metadata of a dataset.
314        Provide either the contents (as a string) or a file path.
315        Both must be in the format of a CSV.
316
317        Args:
318            contents (str): Samplesheet contents to update (should be a CSV string)
319            file_path (PathLike): Path of file to update (should be a CSV file)
320
321        Example:
322        ```python
323        dataset.update_samplesheet(
324            file_path=Path('~/samplesheet.csv')
325        )
326        ```
327        """
328
329        if contents is None and file_path is None:
330            raise DataPortalInputError("Must specify either 'contents' or 'file_path' when updating samplesheet")
331
332        if self.process.executor != Executor.INGEST:
333            raise DataPortalInputError("Cannot update a samplesheet on a non-ingest dataset")
334
335        samplesheet_contents = contents
336        if file_path is not None:
337            samplesheet_contents = Path(file_path).expanduser().read_text()
338
339        # Validate samplesheet
340        file_names = [f.file_name for f in self.list_files()]
341        request = ValidateFileRequirementsRequest(
342            file_names=file_names,
343            sample_sheet=samplesheet_contents,
344        )
345        requirements = validate_file_requirements.sync(process_id=self.process_id,
346                                                       body=request,
347                                                       client=self._client.api_client)
348        if error_msg := requirements.error_msg:
349            raise DataPortalInputError(error_msg)
350
351        # Update the samplesheet if everything looks ok
352        self._client.datasets.update_samplesheet(
353            project_id=self.project_id,
354            dataset_id=self.id,
355            samplesheet=samplesheet_contents
356        )

Updates the samplesheet metadata of a dataset. Provide either the contents (as a string) or a file path. Both must be in the format of a CSV.

Arguments:
  • contents (str): Samplesheet contents to update (should be a CSV string)
  • file_path (PathLike): Path of file to update (should be a CSV file)

Example:

dataset.update_samplesheet(
    file_path=Path('~/samplesheet.csv')
)
class DataPortalReference(cirro.sdk.asset.DataPortalAsset):
class DataPortalReference(DataPortalAsset):
    """
    Reference data object containing files which can be used for analysis in a particular project.
    """
    def __init__(self, ref: Reference, project_id: str, client: CirroApi):
        """
        Instantiate by listing the references which have been added to a particular project

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        project = portal.get_project_by_name("Project Name")
        references = project.list_references()
        ```

        Args:
            ref (Reference): Reference record backing this object
            project_id (str): ID of the project, used when building each file entry
            client (CirroApi): Client handed to each `DataPortalFile`
        """
        self._data = ref
        self._files = [
            DataPortalFile(File.from_file_entry(f, project_id), client) for f in ref.files
        ]

    @property
    def files(self) -> List[DataPortalFile]:
        """File(s) contained in the reference"""
        return self._files

    @property
    def name(self) -> str:
        """Reference name"""
        return self._data.name

    @property
    def type(self) -> str:
        """Type of reference data (e.g. genome_fasta)"""
        return self._data.type

    @property
    def absolute_path(self):
        """Absolute path of the first file in the reference, or None if the reference has no files"""
        if not self._files:
            return None
        return self._files[0].absolute_path

    def __str__(self):
        return self.name

Reference data object containing files which can be used for analysis in a particular project.

DataPortalReference( ref: cirro_api_client.v1.models.Reference, project_id: str, client: CirroApi)
16    def __init__(self, ref: Reference, project_id: str, client: CirroApi):
17        """
18        Instantiate by listing the references which have been added to a particular project
19        ```python
20        from cirro import DataPortal()
21        portal = DataPortal()
22        project = portal.get_project_by_name("Project Name")
23        references = project.list_references()
24        ```
25        """
26        self._data = ref
27        self._files = [
28            DataPortalFile(File.from_file_entry(f, project_id), client) for f in ref.files
29        ]

Instantiate by listing the references which have been added to a particular project

from cirro import DataPortal
portal = DataPortal()
project = portal.get_project_by_name("Project Name")
references = project.list_references()
files: List[cirro.sdk.file.DataPortalFile]
31    @property
32    def files(self) -> List[DataPortalFile]:
33        """File(s) contained in the reference"""
34        return self._files

File(s) contained in the reference

name: str
36    @property
37    def name(self) -> str:
38        """Reference name"""
39        return self._data.name

Reference name

type: str
41    @property
42    def type(self) -> str:
43        """Type of reference data (e.g. genome_fasta)"""
44        return self._data.type

Type of reference data (e.g. genome_fasta)

absolute_path
46    @property
47    def absolute_path(self):
48        if len(self._files) == 0:
49            return None
50        return self._files[0].absolute_path
class CirroApi:
 12class CirroApi:
 13    """
 14    Client for interacting directly with the Cirro API
 15    """
 16    def __init__(self, auth_info: AuthInfo = None, base_url: str = None):
 17        """
 18        Instantiates the Cirro API object
 19
 20        Args:
 21            auth_info (cirro.auth.base.AuthInfo):
 22            base_url (str): Optional base URL of the Cirro instance
 23             (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
 24
 25        Returns:
 26            Authenticated Cirro API object, which can be used to call endpoint functions.
 27
 28        Example:
 29        ```python
 30        from cirro.cirro_client import CirroApi
 31
 32        cirro = CirroApi(base_url="app.cirro.bio")
 33        print(cirro.projects.list())
 34        ```
 35        """
 36
 37        self._configuration = AppConfig(base_url=base_url)
 38        if not auth_info:
 39            auth_info = get_auth_info_from_config(self._configuration, auth_io=None)
 40
 41        self._api_client = CirroApiClient(
 42            base_url=self._configuration.rest_endpoint,
 43            auth_method=auth_info.get_auth_method(),
 44            client_name='Cirro SDK',
 45            package_name='cirro'
 46        )
 47
 48        # Init services
 49        self._file_service = FileService(self._api_client,
 50                                         checksum_method=self._configuration.checksum_method,
 51                                         transfer_retries=self._configuration.transfer_max_retries)
 52        self._dataset_service = DatasetService(self._api_client, file_service=self._file_service)
 53        self._project_service = ProjectService(self._api_client)
 54        self._process_service = ProcessService(self._api_client)
 55        self._execution_service = ExecutionService(self._api_client)
 56        self._compute_environment_service = ComputeEnvironmentService(self._api_client)
 57        self._metrics_service = MetricsService(self._api_client)
 58        self._metadata_service = MetadataService(self._api_client)
 59        self._billing_service = BillingService(self._api_client)
 60        self._references_service = ReferenceService(self._api_client, file_service=self._file_service)
 61        self._shares_service = ShareService(self._api_client)
 62        self._users_service = UserService(self._api_client)
 63
 64    @property
 65    def datasets(self) -> DatasetService:
 66        """
 67        Create, list, delete, and modify Datasets
 68        """
 69        return self._dataset_service
 70
 71    @property
 72    def projects(self) -> ProjectService:
 73        """
 74        Create, list, delete, and modify Projects
 75        """
 76        return self._project_service
 77
 78    @property
 79    def processes(self) -> ProcessService:
 80        """
 81        List and retrieve detailed information about Processes
 82        """
 83        return self._process_service
 84
 85    @property
 86    def execution(self) -> ExecutionService:
 87        """
 88        List, run, stop, and describe the analysis jobs (executing Processes to create new Datasets)
 89        """
 90        return self._execution_service
 91
 92    @property
 93    def compute_environments(self) -> ComputeEnvironmentService:
 94        """
 95        List and update compute environments
 96        """
 97        return self._compute_environment_service
 98
 99    @property
100    def metrics(self) -> MetricsService:
101        """
102        Project-level summary metrics
103        """
104        return self._metrics_service
105
106    @property
107    def metadata(self) -> MetadataService:
108        """
109        List and modify Sample metadata or metadata schemas
110        """
111        return self._metadata_service
112
113    @property
114    def billing(self) -> BillingService:
115        """
116        List and update billing accounts
117        """
118        return self._billing_service
119
120    @property
121    def references(self) -> ReferenceService:
122        """
123        List References and Reference types
124        """
125        return self._references_service
126
127    @property
128    def shares(self) -> ShareService:
129        """
130        List, create, update, delete, and subscribe to shares
131        """
132        return self._shares_service
133
134    @property
135    def users(self) -> UserService:
136        """
137        List and update user information
138        """
139        return self._users_service
140
141    @property
142    def file(self) -> FileService:
143        """
144        Read, download, and create file objects
145        """
146        return self._file_service
147
148    @property
149    def api_client(self) -> CirroApiClient:
150        """
151        Gets the underlying API client
152        """
153        return self._api_client
154
155    @property
156    def configuration(self) -> AppConfig:
157        """
158        Gets the configuration of the instance
159        """
160        return self._configuration

Client for interacting directly with the Cirro API

CirroApi(auth_info: cirro.auth.base.AuthInfo = None, base_url: str = None)
16    def __init__(self, auth_info: AuthInfo = None, base_url: str = None):
17        """
18        Instantiates the Cirro API object
19
20        Args:
21            auth_info (cirro.auth.base.AuthInfo):
22            base_url (str): Optional base URL of the Cirro instance
23             (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
24
25        Returns:
26            Authenticated Cirro API object, which can be used to call endpoint functions.
27
28        Example:
29        ```python
30        from cirro.cirro_client import CirroApi
31
32        cirro = CirroApi(base_url="app.cirro.bio")
33        print(cirro.projects.list())
34        ```
35        """
36
37        self._configuration = AppConfig(base_url=base_url)
38        if not auth_info:
39            auth_info = get_auth_info_from_config(self._configuration, auth_io=None)
40
41        self._api_client = CirroApiClient(
42            base_url=self._configuration.rest_endpoint,
43            auth_method=auth_info.get_auth_method(),
44            client_name='Cirro SDK',
45            package_name='cirro'
46        )
47
48        # Init services
49        self._file_service = FileService(self._api_client,
50                                         checksum_method=self._configuration.checksum_method,
51                                         transfer_retries=self._configuration.transfer_max_retries)
52        self._dataset_service = DatasetService(self._api_client, file_service=self._file_service)
53        self._project_service = ProjectService(self._api_client)
54        self._process_service = ProcessService(self._api_client)
55        self._execution_service = ExecutionService(self._api_client)
56        self._compute_environment_service = ComputeEnvironmentService(self._api_client)
57        self._metrics_service = MetricsService(self._api_client)
58        self._metadata_service = MetadataService(self._api_client)
59        self._billing_service = BillingService(self._api_client)
60        self._references_service = ReferenceService(self._api_client, file_service=self._file_service)
61        self._shares_service = ShareService(self._api_client)
62        self._users_service = UserService(self._api_client)

Instantiates the Cirro API object

Arguments:
  • auth_info (cirro.auth.base.AuthInfo): Optional authentication info (if not provided, it is loaded from the configuration)
  • base_url (str): Optional base URL of the Cirro instance (if not provided, it uses the CIRRO_BASE_URL environment variable, or the config file)
Returns:

Authenticated Cirro API object, which can be used to call endpoint functions.

Example:

from cirro.cirro_client import CirroApi

cirro = CirroApi(base_url="app.cirro.bio")
print(cirro.projects.list())
datasets: cirro.services.DatasetService
64    @property
65    def datasets(self) -> DatasetService:
66        """
67        Create, list, delete, and modify Datasets
68        """
69        return self._dataset_service

Create, list, delete, and modify Datasets

projects: cirro.services.ProjectService
71    @property
72    def projects(self) -> ProjectService:
73        """
74        Create, list, delete, and modify Projects
75        """
76        return self._project_service

Create, list, delete, and modify Projects

processes: cirro.services.ProcessService
78    @property
79    def processes(self) -> ProcessService:
80        """
81        List and retrieve detailed information about Processes
82        """
83        return self._process_service

List and retrieve detailed information about Processes

execution: cirro.services.ExecutionService
85    @property
86    def execution(self) -> ExecutionService:
87        """
88        List, run, stop, and describe the analysis jobs (executing Processes to create new Datasets)
89        """
90        return self._execution_service

List, run, stop, and describe the analysis jobs (executing Processes to create new Datasets)

compute_environments: cirro.services.ComputeEnvironmentService
92    @property
93    def compute_environments(self) -> ComputeEnvironmentService:
94        """
95        List and update compute environments
96        """
97        return self._compute_environment_service

List and update compute environments

metrics: cirro.services.MetricsService
 99    @property
100    def metrics(self) -> MetricsService:
101        """
102        Project-level summary metrics
103        """
104        return self._metrics_service

Project-level summary metrics

metadata: cirro.services.MetadataService
106    @property
107    def metadata(self) -> MetadataService:
108        """
109        List and modify Sample metadata or metadata schemas
110        """
111        return self._metadata_service

List and modify Sample metadata or metadata schemas

billing: cirro.services.BillingService
113    @property
114    def billing(self) -> BillingService:
115        """
116        List and update billing accounts
117        """
118        return self._billing_service

List and update billing accounts

references: cirro.services.ReferenceService
120    @property
121    def references(self) -> ReferenceService:
122        """
123        List References and Reference types
124        """
125        return self._references_service

List References and Reference types

shares: cirro.services.ShareService
127    @property
128    def shares(self) -> ShareService:
129        """
130        List, create, update, delete, and subscribe to shares
131        """
132        return self._shares_service

List, create, update, delete, and subscribe to shares

users: cirro.services.UserService
134    @property
135    def users(self) -> UserService:
136        """
137        List and update user information
138        """
139        return self._users_service

List and update user information

file: cirro.services.FileService
141    @property
142    def file(self) -> FileService:
143        """
144        Read, download, and create file objects
145        """
146        return self._file_service

Read, download, and create file objects

api_client: cirro_api_client.CirroApiClient
148    @property
149    def api_client(self) -> CirroApiClient:
150        """
151        Gets the underlying API client
152        """
153        return self._api_client

Gets the underlying API client

configuration: cirro.config.AppConfig
155    @property
156    def configuration(self) -> AppConfig:
157        """
158        Gets the configuration of the instance
159        """
160        return self._configuration

Gets the configuration of the instance