cirro
1import cirro.file_utils # noqa 2from cirro.cirro_client import CirroApi 3from cirro.sdk.dataset import DataPortalDataset 4from cirro.sdk.portal import DataPortal 5from cirro.sdk.process import DataPortalProcess 6from cirro.sdk.project import DataPortalProject 7from cirro.sdk.reference import DataPortalReference 8 9__all__ = [ 10 'DataPortal', 11 'DataPortalProject', 12 'DataPortalProcess', 13 'DataPortalDataset', 14 'DataPortalReference', 15 'CirroApi', 16 'file_utils' 17]
class DataPortal:
    """
    Helper functions for exploring the Projects, Datasets, Samples, and Files
    available in the Data Portal.
    """

    def __init__(self, client: CirroApi = None):
        """
        Set up the DataPortal object, establishing an authenticated connection.

        Args:
            client (CirroApi): Client to reuse. When omitted, a default
                `CirroApi()` is constructed.
        """
        # Build the default client only when the caller did not supply one
        self._client = client if client is not None else CirroApi()

    def list_projects(self) -> DataPortalProjects:
        """List all the projects available in the Data Portal."""

        return DataPortalProjects(
            [
                DataPortalProject(proj, self._client)
                for proj in self._client.projects.list()
            ]
        )

    def get_project_by_name(self, name: str = None) -> DataPortalProject:
        """Return the project with the specified name."""

        return self.list_projects().get_by_name(name)

    def get_project_by_id(self, _id: str = None) -> DataPortalProject:
        """Return the project with the specified id."""

        return self.list_projects().get_by_id(_id)

    def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset:
        """
        Return a dataset identified by ID or name.

        Each value is first tried as an ID; if that lookup raises
        `DataPortalAssetNotFound`, it is retried as a name.

        Args:
            project (str): ID or name of project
            dataset (str): ID or name of dataset

        Returns:
            `cirro.sdk.dataset.DataPortalDataset`

            ```python
            from cirro import DataPortal
            portal = DataPortal()
            dataset = portal.get_dataset(
                project="id-or-name-of-project",
                dataset="id-or-name-of-dataset"
            )
            ```
        """
        # Use a separate local so the `project` string parameter is not
        # rebound to an object of a different type
        try:
            portal_project: DataPortalProject = self.get_project_by_id(project)
        except DataPortalAssetNotFound:
            portal_project = self.get_project_by_name(project)

        try:
            return portal_project.get_dataset_by_id(dataset)
        except DataPortalAssetNotFound:
            return portal_project.get_dataset_by_name(dataset)

    def list_processes(self, ingest=False) -> DataPortalProcesses:
        """
        List the processes available in the Data Portal.
        By default, all processes are listed (both ingest and non-ingest).
        To list only the processes which can be used to upload datasets, use `ingest = True`.

        Args:
            ingest (bool): If True, only list those processes which can be used to ingest datasets directly
        """

        return DataPortalProcesses(
            [
                DataPortalProcess(p, self._client)
                for p in self._client.processes.list()
                # No filtering at all unless `ingest` was requested
                if not ingest or p.executor == Executor.INGEST
            ]
        )

    def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess:
        """
        Return the process with the specified name.

        Args:
            name (str): Name of process
            ingest (bool): If True, only search among ingest processes
        """

        return self.list_processes(ingest=ingest).get_by_name(name)

    def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess:
        """
        Return the process with the specified id

        Args:
            id (str): ID of process
            ingest (bool): If True, only search among ingest processes
        """

        return self.list_processes(ingest=ingest).get_by_id(id)

    def list_reference_types(self) -> DataPortalReferenceTypes:
        """
        Return the list of all available reference types
        """

        return DataPortalReferenceTypes(
            [
                DataPortalReferenceType(ref)
                for ref in self._client.references.get_types()
            ]
        )
Helper functions for exploring the Projects, Datasets, Samples, and Files available in the Data Portal.
def __init__(self, client: CirroApi = None):
    """
    Set up the DataPortal object, establishing an authenticated connection.

    Args:
        client (CirroApi): Client to reuse; a default `CirroApi()` is
            created when this is None.
    """
    # Only construct the default client when none was handed in
    self._client = CirroApi() if client is None else client
Set up the DataPortal object, establishing an authenticated connection.
def list_projects(self) -> DataPortalProjects:
    """Wrap every record returned by the client's project listing in a `DataPortalProject`."""
    wrapped = []
    for record in self._client.projects.list():
        wrapped.append(DataPortalProject(record, self._client))
    return DataPortalProjects(wrapped)
List all the projects available in the Data Portal.
def get_project_by_name(self, name: str = None) -> DataPortalProject:
    """Look up a single project by name among the listed projects."""
    all_projects = self.list_projects()
    return all_projects.get_by_name(name)
Return the project with the specified name.
def get_project_by_id(self, _id: str = None) -> DataPortalProject:
    """Look up a single project by ID among the listed projects."""
    all_projects = self.list_projects()
    return all_projects.get_by_id(_id)
Return the project with the specified id.
def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset:
    """
    Return a dataset identified by ID or name.

    Each value is first tried as an ID; if that lookup raises
    `DataPortalAssetNotFound`, it is retried as a name.

    Args:
        project (str): ID or name of project
        dataset (str): ID or name of dataset

    Returns:
        `cirro.sdk.dataset.DataPortalDataset`

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        ```
    """
    # Use a separate local so the `project` string parameter is not
    # rebound to an object of a different type
    try:
        portal_project: DataPortalProject = self.get_project_by_id(project)
    except DataPortalAssetNotFound:
        portal_project = self.get_project_by_name(project)

    try:
        return portal_project.get_dataset_by_id(dataset)
    except DataPortalAssetNotFound:
        return portal_project.get_dataset_by_name(dataset)
Return a dataset identified by ID or name.
Arguments:
- project (str): ID or name of project
- dataset (str): ID or name of dataset
Returns: `cirro.sdk.dataset.DataPortalDataset`
from cirro import DataPortal portal = DataPortal() dataset = portal.get_dataset( project="id-or-name-of-project", dataset="id-or-name-of-dataset" )
def list_processes(self, ingest=False) -> DataPortalProcesses:
    """
    List the processes available in the Data Portal.
    By default, all processes are listed (both ingest and non-ingest).
    To list only the processes which can be used to upload datasets, use `ingest = True`.

    Args:
        ingest (bool): If True, only list those processes which can be used to ingest datasets directly
    """

    return DataPortalProcesses(
        [
            DataPortalProcess(p, self._client)
            for p in self._client.processes.list()
            # No filtering at all unless `ingest` was requested
            if not ingest or p.executor == Executor.INGEST
        ]
    )
List all the processes available in the Data Portal.
By default, all processes are listed (both ingest and non-ingest).
To list only the processes which can be used to upload datasets, use `ingest = True`.
Arguments:
- ingest (bool): If True, only list those processes which can be used to ingest datasets directly
def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess:
    """
    Look up a single process by name.

    Args:
        name (str): Name of process
        ingest (bool): Forwarded to `list_processes` to restrict the search
    """
    candidates = self.list_processes(ingest=ingest)
    return candidates.get_by_name(name)
Return the process with the specified name.
Arguments:
- name (str): Name of process
def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess:
    """
    Look up a single process by ID.

    Args:
        id (str): ID of process
        ingest (bool): Forwarded to `list_processes` to restrict the search
    """
    candidates = self.list_processes(ingest=ingest)
    return candidates.get_by_id(id)
Return the process with the specified id
Arguments:
- id (str): ID of process
def list_reference_types(self) -> DataPortalReferenceTypes:
    """
    Wrap every reference type reported by the client in a `DataPortalReferenceType`.
    """
    wrapped = []
    for ref_type in self._client.references.get_types():
        wrapped.append(DataPortalReferenceType(ref_type))
    return DataPortalReferenceTypes(wrapped)
Return the list of all available reference types
class DataPortalProject(DataPortalAsset):
    """
    Projects in the Data Portal contain collections of Datasets.
    Users are granted permissions at the project-level, allowing them
    to view and/or modify all the datasets in that collection.
    """
    def __init__(self, proj: Project, client: CirroApi):
        """
        Instantiate with helper method

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        project = portal.get_project_by_name("Project Name")
        ```

        """
        self._data = proj
        self._client = client

    @property
    def id(self) -> str:
        """
        Unique identifier
        """
        return self._data.id

    @property
    def name(self) -> str:
        """
        Readable name
        """
        return self._data.name

    @property
    def description(self) -> str:
        """
        Longer description of the project
        """
        return self._data.description

    def __str__(self):
        """Control how the Project is rendered as a string."""

        return '\n'.join(
            f"{field.title()}: {getattr(self, field)}"
            for field in ('name', 'id', 'description')
        )

    # NOTE(review): assuming `cache` is `functools.cache`, caching a method keys
    # on `self`, keeps instances alive, and `cache_clear()` drops the entries of
    # every project rather than just this one -- consider a per-instance cache.
    @cache
    def _get_datasets(self) -> List[Dataset]:
        """Fetch (and memoize) the dataset records listed for this project."""
        return self._client.datasets.list(self.id)

    def list_datasets(self, force_refresh=False) -> DataPortalDatasets:
        """
        List all the datasets available in the project.

        Args:
            force_refresh (bool): Clear the cached listing before reading it
        """
        if force_refresh:
            self._get_datasets.cache_clear()

        return DataPortalDatasets(
            [
                DataPortalDataset(d, self._client)
                for d in self._get_datasets()
            ]
        )

    def get_dataset_by_name(self, name: str, force_refresh=False) -> DataPortalDataset:
        """
        Return the dataset with the specified name.

        Args:
            name (str): Name of the dataset
            force_refresh (bool): Clear the cached listing before searching it

        Raises:
            DataPortalAssetNotFound: No listed dataset has that name
        """
        if force_refresh:
            self._get_datasets.cache_clear()

        # First exact name match in the cached listing
        dataset = next((d for d in self._get_datasets() if d.name == name), None)
        if dataset is None:
            raise DataPortalAssetNotFound(f'Dataset with name {name} not found')
        # Re-fetch by ID rather than wrapping the listing record directly
        return self.get_dataset_by_id(dataset.id)

    def get_dataset_by_id(self, _id: str = None) -> DataPortalDataset:
        """
        Return the dataset with the specified id.

        Raises:
            DataPortalAssetNotFound: The client returned None for that ID
        """

        dataset = self._client.datasets.get(project_id=self.id, dataset_id=_id)
        if dataset is None:
            raise DataPortalAssetNotFound(f'Dataset with ID {_id} not found')
        return DataPortalDataset(dataset, self._client)

    def list_references(self, reference_type: str = None) -> DataPortalReferences:
        """
        List the references available in a project.
        Optionally filter to references of a particular type (identified by name)

        Raises:
            DataPortalAssetNotFound: `reference_type` matched no known reference type
        """

        # Get the complete list of references which are available
        reference_types = DataPortalReferenceTypes(
            [
                DataPortalReferenceType(ref)
                for ref in self._client.references.get_types()
            ]
        )

        # If a particular name was specified, make sure it matches a known type
        if reference_type is not None:
            reference_types = reference_types.filter_by_pattern(reference_type)
            if len(reference_types) == 0:
                msg = f"Could not find any reference types with the name {reference_type}"
                raise DataPortalAssetNotFound(msg)

        return DataPortalReferences(
            [
                DataPortalReference(ref, project_id=self.id, client=self._client)
                for ref in self._client.references.get_for_project(
                    self.id
                )
                if reference_type is None or ref.type == reference_type
            ]
        )

    def get_reference_by_name(self, name: str = None, ref_type: str = None) -> DataPortalReference:
        """
        Return the reference of a particular type with the specified name.

        Raises:
            DataPortalInputError: `name` was not provided
        """

        if name is None:
            raise DataPortalInputError("Must specify the reference name")

        return self.list_references(ref_type).get_by_name(name)

    def upload_dataset(
        self,
        name: str = None,
        description='',
        process: Union[DataPortalProcess, str] = None,
        upload_folder: str = None,
        files: list = None
    ) -> DataPortalDataset:
        """
        Upload a set of files to the Data Portal, creating a new dataset.

        If the files parameter is not provided, it will upload all files in the upload folder

        Args:
            name (str): Name of newly created dataset
            description (str): Description of newly created dataset
            process (str | DataPortalProcess): Process to run may be referenced by name, ID, or object
            upload_folder (str): Folder containing files to upload
            files (List[str]): Optional subset of files to upload from the folder

        Returns:
            The newly created dataset

        Raises:
            DataPortalInputError: `name`, `process`, or `upload_folder` is missing
            RuntimeWarning: There are no files to upload
            DataPortalAssetNotFound: The new dataset could not be fetched after all retries
        """

        if name is None:
            raise DataPortalInputError("Must provide name for new dataset")
        if process is None:
            raise DataPortalInputError("Must provide the process which is used for ingest")
        if upload_folder is None:
            raise DataPortalInputError("Must provide upload_folder -- folder containing files to upload")

        # Parse the process provided by the user
        process = parse_process_name_or_id(process, self._client)

        # If no files were provided, take everything in the upload folder
        if files is None:
            files = get_files_in_directory(upload_folder)

        if not files:
            raise RuntimeWarning("No files to upload, exiting")

        # Make sure that the files match the expected pattern
        self._client.processes.check_dataset_files(files, process.id, upload_folder)

        # Create the ingest process request
        dataset_create_request = UploadDatasetRequest(
            process_id=process.id,
            name=name,
            description=description,
            expected_files=files
        )

        # Get the response
        create_response = self._client.datasets.create(project_id=self.id,
                                                       upload_request=dataset_create_request)

        # Upload the files
        self._client.datasets.upload_files(
            project_id=self.id,
            dataset_id=create_response.id,
            local_directory=upload_folder,
            files=files
        )

        # Return the dataset which was created, which might take a second to update
        max_attempts = 5
        for attempt in range(max_attempts):
            try:
                return self.get_dataset_by_id(create_response.id)
            except DataPortalAssetNotFound:
                if attempt == max_attempts - 1:
                    # Out of retries: re-raise with the original traceback
                    raise
                sleep(2)
Projects in the Data Portal contain collections of Datasets. Users are granted permissions at the project-level, allowing them to view and/or modify all the datasets in that collection.
def __init__(self, proj: Project, client: CirroApi):
    """
    Instantiate with helper method

    ```python
    from cirro import DataPortal
    portal = DataPortal()
    project = portal.get_project_by_name("Project Name")
    ```

    Args:
        proj (Project): Project record to wrap
        client (CirroApi): Client used for subsequent API calls
    """
    self._data = proj
    self._client = client
Instantiate with helper method
from cirro import DataPortal
portal = DataPortal()
project = portal.get_project_by_name("Project Name")
@property
def description(self) -> str:
    """
    Longer description of the project, read from the wrapped record
    """
    record = self._data
    return record.description
Longer description of the project
def list_datasets(self, force_refresh=False) -> DataPortalDatasets:
    """
    List all the datasets available in the project.

    Args:
        force_refresh (bool): Clear the cached listing before reading it
    """
    if force_refresh:
        self._get_datasets.cache_clear()

    wrapped = []
    for record in self._get_datasets():
        wrapped.append(DataPortalDataset(record, self._client))
    return DataPortalDatasets(wrapped)
List all the datasets available in the project.
def get_dataset_by_name(self, name: str, force_refresh=False) -> DataPortalDataset:
    """
    Return the dataset with the specified name.

    Args:
        name (str): Name of the dataset
        force_refresh (bool): Clear the cached listing before searching it

    Raises:
        DataPortalAssetNotFound: No listed dataset has that name
    """
    if force_refresh:
        self._get_datasets.cache_clear()

    # The first record whose name matches exactly wins
    for candidate in self._get_datasets():
        if candidate.name == name:
            # Re-fetch by ID rather than wrapping the listing record directly
            return self.get_dataset_by_id(candidate.id)

    raise DataPortalAssetNotFound(f'Dataset with name {name} not found')
Return the dataset with the specified name.
def get_dataset_by_id(self, _id: str = None) -> DataPortalDataset:
    """
    Return the dataset with the specified id.

    Raises:
        DataPortalAssetNotFound: The client returned None for that ID
    """
    record = self._client.datasets.get(project_id=self.id, dataset_id=_id)
    if record is not None:
        return DataPortalDataset(record, self._client)
    raise DataPortalAssetNotFound(f'Dataset with ID {_id} not found')
Return the dataset with the specified id.
def list_references(self, reference_type: str = None) -> DataPortalReferences:
    """
    List the references available in a project.
    Optionally filter to references of a particular type (identified by name)

    Raises:
        DataPortalAssetNotFound: `reference_type` matched no known reference type
    """
    # Every reference type the client knows about
    known_types = DataPortalReferenceTypes(
        [DataPortalReferenceType(ref) for ref in self._client.references.get_types()]
    )

    filtering = reference_type is not None

    # The filtered type list is only used as an existence check
    # NOTE(review): types are matched with `filter_by_pattern` here, but the
    # references below are compared with `==` -- confirm these agree.
    if filtering:
        known_types = known_types.filter_by_pattern(reference_type)
        if len(known_types) == 0:
            msg = f"Could not find any reference types with the name {reference_type}"
            raise DataPortalAssetNotFound(msg)

    selected = []
    for ref in self._client.references.get_for_project(self.id):
        if filtering and ref.type != reference_type:
            continue
        selected.append(DataPortalReference(ref, project_id=self.id, client=self._client))
    return DataPortalReferences(selected)
List the references available in a project. Optionally filter to references of a particular type (identified by name)
def get_reference_by_name(self, name: str = None, ref_type: str = None) -> DataPortalReference:
    """
    Return the reference of a particular type with the specified name.

    Raises:
        DataPortalInputError: `name` was not provided
    """
    if name is None:
        raise DataPortalInputError("Must specify the reference name")

    matching_type = self.list_references(ref_type)
    return matching_type.get_by_name(name)
Return the reference of a particular type with the specified name.
def upload_dataset(
    self,
    name: str = None,
    description='',
    process: Union[DataPortalProcess, str] = None,
    upload_folder: str = None,
    files: list = None
) -> DataPortalDataset:
    """
    Upload a set of files to the Data Portal, creating a new dataset.

    If the files parameter is not provided, it will upload all files in the upload folder

    Args:
        name (str): Name of newly created dataset
        description (str): Description of newly created dataset
        process (str | DataPortalProcess): Process to run may be referenced by name, ID, or object
        upload_folder (str): Folder containing files to upload
        files (List[str]): Optional subset of files to upload from the folder

    Returns:
        The newly created dataset

    Raises:
        DataPortalInputError: `name`, `process`, or `upload_folder` is missing
        RuntimeWarning: There are no files to upload
        DataPortalAssetNotFound: The new dataset could not be fetched after all retries
    """

    if name is None:
        raise DataPortalInputError("Must provide name for new dataset")
    if process is None:
        raise DataPortalInputError("Must provide the process which is used for ingest")
    if upload_folder is None:
        raise DataPortalInputError("Must provide upload_folder -- folder containing files to upload")

    # Parse the process provided by the user
    process = parse_process_name_or_id(process, self._client)

    # If no files were provided, take everything in the upload folder
    if files is None:
        files = get_files_in_directory(upload_folder)

    if not files:
        raise RuntimeWarning("No files to upload, exiting")

    # Make sure that the files match the expected pattern
    self._client.processes.check_dataset_files(files, process.id, upload_folder)

    # Create the ingest process request
    dataset_create_request = UploadDatasetRequest(
        process_id=process.id,
        name=name,
        description=description,
        expected_files=files
    )

    # Get the response
    create_response = self._client.datasets.create(project_id=self.id,
                                                   upload_request=dataset_create_request)

    # Upload the files
    self._client.datasets.upload_files(
        project_id=self.id,
        dataset_id=create_response.id,
        local_directory=upload_folder,
        files=files
    )

    # Return the dataset which was created, which might take a second to update
    max_attempts = 5
    for attempt in range(max_attempts):
        try:
            return self.get_dataset_by_id(create_response.id)
        except DataPortalAssetNotFound:
            if attempt == max_attempts - 1:
                # Out of retries: re-raise with the original traceback
                raise
            sleep(2)
Upload a set of files to the Data Portal, creating a new dataset.
If the files parameter is not provided, it will upload all files in the upload folder
Arguments:
- name (str): Name of newly created dataset
- description (str): Description of newly created dataset
- process (str | DataPortalProcess): Process to run may be referenced by name, ID, or object
- upload_folder (str): Folder containing files to upload
- files (List[str]): Optional subset of files to upload from the folder
class DataPortalProcess(DataPortalAsset):
    """Helper functions for interacting with analysis processes."""
    _data: Process

    def __init__(self, process: Process, client: CirroApi):
        """
        Instantiate with helper method

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        process = portal.get_process_by_name("Process Name")
        ```
        """
        self._data = process
        self._client = client

    @property
    def id(self) -> str:
        """Unique identifier"""
        return self._data.id

    @property
    def name(self) -> str:
        """Readable name"""
        return self._data.name

    @property
    def description(self) -> str:
        """Longer description of process"""
        return self._data.description

    @property
    def child_process_ids(self) -> List[str]:
        """List of processes which can be run on the output of this process"""
        return self._data.child_process_ids

    @property
    def executor(self) -> Executor:
        """INGEST, CROMWELL, or NEXTFLOW"""
        return self._data.executor

    @property
    def documentation_url(self) -> str:
        """Documentation URL"""
        return self._data.documentation_url

    @property
    def file_requirements_message(self) -> str:
        """Description of files required for INGEST processes"""
        return self._data.file_requirements_message

    def __str__(self):
        """Render the name, ID, and description, one per line."""
        return '\n'.join(
            f"{field.title()}: {getattr(self, field)}"
            for field in ('name', 'id', 'description')
        )

    def get_parameter_spec(self) -> ParameterSpecification:
        """
        Gets a specification used to describe the parameters used in the process.
        """
        return self._client.processes.get_parameter_spec(self.id)
Helper functions for interacting with analysis processes.
def __init__(self, process: Process, client: CirroApi):
    """
    Instantiate with helper method

    ```python
    from cirro import DataPortal
    portal = DataPortal()
    process = portal.get_process_by_name("Process Name")
    ```

    Args:
        process (Process): Process record to wrap
        client (CirroApi): Client used for subsequent API calls
    """
    self._data = process
    self._client = client
Instantiate with helper method
from cirro import DataPortal
portal = DataPortal()
process = portal.get_process_by_name("Process Name")
@property
def description(self) -> str:
    """Longer description of the process, read from the wrapped record"""
    record = self._data
    return record.description
Longer description of process
@property
def child_process_ids(self) -> List[str]:
    """IDs of the processes which can be run on the output of this process"""
    record = self._data
    return record.child_process_ids
List of processes which can be run on the output of this process
@property
def executor(self) -> Executor:
    """Executor of the process: INGEST, CROMWELL, or NEXTFLOW"""
    record = self._data
    return record.executor
INGEST, CROMWELL, or NEXTFLOW
@property
def documentation_url(self) -> str:
    """Documentation URL stored on the wrapped process record"""
    record = self._data
    return record.documentation_url
Documentation URL
@property
def file_requirements_message(self) -> str:
    """Description of the files required for INGEST processes"""
    record = self._data
    return record.file_requirements_message
Description of files required for INGEST processes
class DataPortalDataset(DataPortalAsset):
    """
    Datasets in the Data Portal are collections of files which have
    either been uploaded directly, or which have been output by
    an analysis pipeline or notebook.
    """

    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
        """
        Instantiate a dataset object

        Should be invoked from a top-level constructor, for example:

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        ```

        Raises:
            AssertionError: `dataset.project_id` is None
        """
        # Raise explicitly (same exception type as before) so the check
        # is not stripped when Python runs with -O
        if dataset.project_id is None:
            raise AssertionError("Must provide dataset with project_id attribute")
        self._data = dataset
        # Populated lazily by list_files()
        self._files: Optional[DataPortalFiles] = None
        self._client = client

    @property
    def id(self) -> str:
        """Unique identifier for the dataset"""
        return self._data.id

    @property
    def name(self) -> str:
        """Editable name for the dataset"""
        return self._data.name

    @property
    def description(self) -> str:
        """Longer description of the dataset"""
        return self._data.description

    @property
    def process_id(self) -> str:
        """Unique ID of process used to create the dataset"""
        return self._data.process_id

    @property
    def process(self) -> ProcessDetail:
        """
        Object representing the process used to create the dataset
        """
        return self._client.processes.get(self.process_id)

    @property
    def project_id(self) -> str:
        """ID of the project containing the dataset"""
        return self._data.project_id

    @property
    def status(self) -> Status:
        """
        Status of the dataset
        """
        return self._data.status

    @property
    def source_dataset_ids(self) -> List[str]:
        """IDs of the datasets used as sources for this dataset (if any)"""
        return self._data.source_dataset_ids

    @property
    def source_datasets(self) -> List['DataPortalDataset']:
        """
        Objects representing the datasets used as sources for this dataset (if any)
        """
        return [
            DataPortalDataset(
                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
                client=self._client
            )
            for dataset_id in self.source_dataset_ids
        ]

    @property
    def params(self) -> DatasetDetailParams:
        """
        Parameters used to generate the dataset
        """
        return self._get_detail().params

    @property
    def info(self) -> DatasetDetailInfo:
        """
        Detailed information about the dataset
        """
        return self._get_detail().info

    @property
    def tags(self) -> List[Tag]:
        """
        Tags applied to the dataset
        """
        return self._data.tags

    @property
    def created_by(self) -> str:
        """User who created the dataset"""
        return self._data.created_by

    @property
    def created_at(self) -> datetime.datetime:
        """Timestamp of dataset creation"""
        return self._data.created_at

    def _get_detail(self):
        """Return the wrapped record, first upgrading it to a `DatasetDetail` if needed."""
        if not isinstance(self._data, DatasetDetail):
            self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id)
        return self._data

    def __str__(self):
        """Render the name, ID, description, and status, one per line."""
        return '\n'.join(
            f"{field.title()}: {getattr(self, field)}"
            for field in ('name', 'id', 'description', 'status')
        )

    def list_files(self) -> DataPortalFiles:
        """
        Return the list of files which make up the dataset.

        The listing is fetched on the first call and stored on the instance;
        later calls return the stored object.
        """
        # Compare against None rather than truthiness: an empty listing
        # (falsy if `DataPortalFiles` is list-like) would otherwise be
        # re-fetched on every call
        if self._files is None:
            self._files = DataPortalFiles(
                [
                    DataPortalFile(file=file, client=self._client)
                    for file in self._client.datasets.get_file_listing(
                        project_id=self.project_id,
                        dataset_id=self.id
                    )
                ]
            )
        return self._files

    def download_files(self, download_location: str = None) -> None:
        """
        Download all the files from the dataset to a local directory.

        Args:
            download_location (str): Path to local directory
        """

        # Alias for internal method
        self.list_files().download(download_location)

    def run_analysis(
            self,
            name: str = None,
            description: str = "",
            process: Union[DataPortalProcess, str] = None,
            params=None,
            notifications_emails=None
    ) -> str:
        """
        Runs an analysis on a dataset, returns the ID of the newly created dataset.

        The process can be provided as either a DataPortalProcess object,
        or a string which corresponds to the name or ID of the process.

        Args:
            name (str): Name of newly created dataset
            description (str): Description of newly created dataset
            process (DataPortalProcess or str): Process to run
            params (dict): Analysis parameters
            notifications_emails (List[str]): Notification email address(es)

        Returns:
            dataset_id (str): ID of newly created dataset

        Raises:
            DataPortalInputError: `name` or `process` was not provided
        """
        if name is None:
            raise DataPortalInputError("Must specify 'name' for run_analysis")
        if process is None:
            raise DataPortalInputError("Must specify 'process' for run_analysis")
        if notifications_emails is None:
            notifications_emails = []
        if params is None:
            params = {}

        # If the process is a string, try to parse it as a process name or ID
        process = parse_process_name_or_id(process, self._client)

        resp = self._client.execution.run_analysis(
            project_id=self.project_id,
            request=RunAnalysisRequest(
                name=name,
                description=description,
                process_id=process.id,
                source_dataset_ids=[self.id],
                params=RunAnalysisRequestParams.from_dict(params),
                notification_emails=notifications_emails
            )
        )
        return resp.id
Datasets in the Data Portal are collections of files which have either been uploaded directly, or which have been output by an analysis pipeline or notebook.
def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
    """
    Instantiate a dataset object

    Should be invoked from a top-level constructor, for example:

    ```python
    from cirro import DataPortal
    portal = DataPortal()
    dataset = portal.get_dataset(
        project="id-or-name-of-project",
        dataset="id-or-name-of-dataset"
    )
    ```

    Raises:
        AssertionError: `dataset.project_id` is None
    """
    # Raise explicitly (same exception type as before) so the check
    # is not stripped when Python runs with -O
    if dataset.project_id is None:
        raise AssertionError("Must provide dataset with project_id attribute")
    self._data = dataset
    # Populated lazily by list_files()
    self._files: Optional[DataPortalFiles] = None
    self._client = client
Instantiate a dataset object
Should be invoked from a top-level constructor, for example:
from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
project="id-or-name-of-project",
dataset="id-or-name-of-dataset"
)
@property
def id(self) -> str:
    """Unique identifier for the dataset, read from the wrapped record"""
    record = self._data
    return record.id
Unique identifier for the dataset
@property
def name(self) -> str:
    """Editable name for the dataset"""
    return self._data.name
Editable name for the dataset
@property
def description(self) -> str:
    """Longer description of the dataset"""
    return self._data.description
Longer description of the dataset
@property
def process_id(self) -> str:
    """ID of the process used to create the dataset, read from the wrapped record"""
    record = self._data
    return record.process_id
Unique ID of process used to create the dataset
@property
def process(self) -> ProcessDetail:
    """
    Object representing the process used to create the dataset,
    fetched from the client on each access
    """
    process_service = self._client.processes
    return process_service.get(self.process_id)
Object representing the process used to create the dataset
@property
def project_id(self) -> str:
    """ID of the project containing the dataset, read from the wrapped record"""
    record = self._data
    return record.project_id
ID of the project containing the dataset
@property
def status(self) -> Status:
    """
    Status of the dataset, read from the wrapped record
    """
    record = self._data
    return record.status
Status of the dataset
@property
def source_dataset_ids(self) -> List[str]:
    """IDs of the datasets used as sources for this dataset (if any)"""
    record = self._data
    return record.source_dataset_ids
IDs of the datasets used as sources for this dataset (if any)
@property
def source_datasets(self) -> List['DataPortalDataset']:
    """
    Objects representing the datasets used as sources for this dataset (if any).
    Each source is fetched from the client by ID on every access.
    """
    sources = []
    for source_id in self.source_dataset_ids:
        detail = self._client.datasets.get(project_id=self.project_id, dataset_id=source_id)
        sources.append(DataPortalDataset(dataset=detail, client=self._client))
    return sources
Objects representing the datasets used as sources for this dataset (if any)
@property
def params(self) -> DatasetDetailParams:
    """
    Parameters used to generate the dataset, read from the detailed record
    """
    detail = self._get_detail()
    return detail.params
Parameters used to generate the dataset
@property
def info(self) -> DatasetDetailInfo:
    """
    Detailed information about the dataset, read from the detailed record
    """
    detail = self._get_detail()
    return detail.info
Detailed information about the dataset
@property
def created_by(self) -> str:
    """User who created the dataset."""
    return self._data.created_by
User who created the dataset
@property
def created_at(self) -> datetime.datetime:
    """Timestamp of dataset creation."""
    return self._data.created_at
Timestamp of dataset creation
def list_files(self) -> DataPortalFiles:
    """
    Return the list of files which make up the dataset.

    The listing is requested through the client and stored on the instance;
    later calls hand back the stored value for as long as it is truthy.
    """
    # Reuse the stored listing when one is already present.
    # NOTE(review): this is a truthiness check, so a stored but falsy value
    # (presumably an empty listing - confirm) leads to a new request.
    if self._files:
        return self._files

    listing = self._client.datasets.get_file_listing(
        project_id=self.project_id,
        dataset_id=self.id
    )
    self._files = DataPortalFiles(
        [DataPortalFile(file=entry, client=self._client) for entry in listing]
    )
    return self._files
Return the list of files which make up the dataset.
def download_files(self, download_location: str = None) -> None:
    """
    Download all the files from the dataset to a local directory.

    Args:
        download_location (str): Path to local directory
    """
    # Thin wrapper: the file collection performs the actual download
    dataset_files = self.list_files()
    dataset_files.download(download_location)
Download all the files from the dataset to a local directory.
Arguments:
- download_location (str): Path to local directory
def run_analysis(
        self,
        name: str = None,
        description: str = "",
        process: Union[DataPortalProcess, str] = None,
        params=None,
        notifications_emails=None
) -> str:
    """
    Run an analysis using this dataset as the source, returning the ID of the newly created dataset.

    The process may be given either as a DataPortalProcess object
    or as a string holding the name or ID of the process.

    Args:
        name (str): Name of newly created dataset (required)
        description (str): Description of newly created dataset
        process (DataPortalProcess or str): Process to run (required)
        params (dict): Analysis parameters (an empty dict is used when omitted)
        notifications_emails (List[str]): Notification email address(es)
            (an empty list is used when omitted)

    Returns:
        dataset_id (str): ID of newly created dataset

    Raises:
        DataPortalInputError: If `name` or `process` is None.
    """
    # 'name' and 'process' default to None in the signature but are mandatory
    if name is None:
        raise DataPortalInputError("Must specify 'name' for run_analysis")
    if process is None:
        raise DataPortalInputError("Must specify 'process' for run_analysis")

    emails = [] if notifications_emails is None else notifications_emails
    analysis_params = {} if params is None else params

    # Resolve the process argument (object, name or ID) via the shared helper
    resolved_process = parse_process_name_or_id(process, self._client)

    request = RunAnalysisRequest(
        name=name,
        description=description,
        process_id=resolved_process.id,
        source_dataset_ids=[self.id],
        params=RunAnalysisRequestParams.from_dict(analysis_params),
        notification_emails=emails
    )
    response = self._client.execution.run_analysis(
        project_id=self.project_id,
        request=request
    )
    return response.id
Runs an analysis on a dataset, returns the ID of the newly created dataset.
The process can be provided as either a DataPortalProcess object, or a string which corresponds to the name or ID of the process.
Arguments:
- name (str): Name of newly created dataset
- description (str): Description of newly created dataset
- process (DataPortalProcess or str): Process to run
- params (dict): Analysis parameters
- notifications_emails (List[str]): Notification email address(es)
Returns:
dataset_id (str): ID of newly created dataset
class DataPortalReference(DataPortalAsset):
    """
    Reference data object containing files which can be used for analysis in a particular project.
    """
    def __init__(self, ref: Reference, project_id: str, client: CirroApi):
        """
        Instantiate by listing the references which have been added to a particular project

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        project = portal.get_project_by_name("Project Name")
        references = project.list_references()
        ```

        Args:
            ref (Reference): Reference record whose name, type and files are exposed
            project_id (str): Project ID passed to `File.from_file_entry` for each file entry
            client (CirroApi): Client handed to each `DataPortalFile` that is created
        """
        self._data = ref
        # Wrap every file entry of the reference once, at construction time
        self._files = [
            DataPortalFile(File.from_file_entry(f, project_id), client) for f in ref.files
        ]

    @property
    def files(self) -> List[DataPortalFile]:
        """File(s) contained in the reference"""
        return self._files

    @property
    def name(self) -> str:
        """Reference name"""
        return self._data.name

    @property
    def type(self) -> str:
        """Type of reference data (e.g. genome_fasta)"""
        return self._data.type

    @property
    def absolute_path(self):
        """Absolute path of the first file in the reference, or None when it has no files"""
        if not self._files:
            return None
        return self._files[0].absolute_path

    def __str__(self):
        """Use the reference name as the string representation"""
        return self.name
Reference data object containing files which can be used for analysis in a particular project.
def __init__(self, ref: Reference, project_id: str, client: CirroApi):
    """
    Instantiate by listing the references which have been added to a particular project

    ```python
    from cirro import DataPortal
    portal = DataPortal()
    project = portal.get_project_by_name("Project Name")
    references = project.list_references()
    ```

    Args:
        ref (Reference): Reference record stored on the instance
        project_id (str): Project ID passed to `File.from_file_entry` for each file entry
        client (CirroApi): Client handed to each `DataPortalFile` that is created
    """
    self._data = ref
    # Wrap every file entry of the reference once, at construction time
    self._files = [
        DataPortalFile(File.from_file_entry(f, project_id), client) for f in ref.files
    ]
Instantiate by listing the references which have been added to a particular project
from cirro import DataPortal
portal = DataPortal()
project = portal.get_project_by_name("Project Name")
references = project.list_references()
@property
def files(self) -> List[DataPortalFile]:
    """File(s) contained in the reference, as the list stored on the instance."""
    return self._files
File(s) contained in the reference
class CirroApi:
    """
    Client for interacting directly with the Cirro API
    """
    def __init__(self, auth_info: AuthInfo = None, base_url: str = None):
        """
        Instantiates the Cirro API object

        Args:
            auth_info (cirro.auth.base.AuthInfo): Optional authentication info;
                when not provided, it is loaded from the configuration
            base_url (str): Optional base URL for connection (default: `CIRRO_HOME` or 'cirro.bio')

        Returns:
            Authenticated Cirro API object which can be used to call endpoint functions.
            For example:

        ```python
        from cirro.cirro_client import CirroApi
        cirro = CirroApi()
        print(cirro.projects.list())
        ```
        """
        config = AppConfig(base_url=base_url)
        self._configuration = config

        # Fall back to the configured authentication when none was supplied
        if not auth_info:
            auth_info = get_auth_info_from_config(config, auth_io=None)

        api_client = CirroApiClient(
            base_url=config.rest_endpoint,
            auth_method=auth_info.get_auth_method(),
            client_name='Cirro SDK',
            package_name='cirro'
        )
        self._api_client = api_client

        # The file service is created first because the dataset service needs it
        file_service = FileService(
            api_client,
            enable_additional_checksum=config.enable_additional_checksum,
            transfer_retries=config.transfer_max_retries
        )
        self._file_service = file_service
        self._dataset_service = DatasetService(api_client, file_service=file_service)

        # The remaining services only need the API client
        self._project_service = ProjectService(api_client)
        self._process_service = ProcessService(api_client)
        self._execution_service = ExecutionService(api_client)
        self._metrics_service = MetricsService(api_client)
        self._metadata_service = MetadataService(api_client)
        self._billing_service = BillingService(api_client)
        self._references_service = ReferenceService(api_client)

    @property
    def datasets(self) -> DatasetService:
        """Create, list, delete, and modify Datasets"""
        return self._dataset_service

    @property
    def projects(self) -> ProjectService:
        """Create, list, delete, and modify Projects"""
        return self._project_service

    @property
    def processes(self) -> ProcessService:
        """List and retrieve detailed information about Processes"""
        return self._process_service

    @property
    def execution(self) -> ExecutionService:
        """List, run, stop, and describe the analysis jobs (executing Processes to create new Datasets)"""
        return self._execution_service

    @property
    def metrics(self) -> MetricsService:
        """Project-level summary metrics"""
        return self._metrics_service

    @property
    def metadata(self) -> MetadataService:
        """List and modify Sample metadata or metadata schemas"""
        return self._metadata_service

    @property
    def billing(self) -> BillingService:
        """List and update billing accounts"""
        return self._billing_service

    @property
    def references(self) -> ReferenceService:
        """List References and Reference types"""
        return self._references_service

    @property
    def file(self) -> FileService:
        """Read, download, and create file objects"""
        return self._file_service

    @property
    def api_client(self) -> CirroApiClient:
        """Gets the underlying API client"""
        return self._api_client

    @property
    def configuration(self) -> AppConfig:
        """Gets the configuration of the instance"""
        return self._configuration
Client for interacting directly with the Cirro API
def __init__(self, auth_info: AuthInfo = None, base_url: str = None):
    """
    Instantiates the Cirro API object

    Args:
        auth_info (cirro.auth.base.AuthInfo): Optional authentication info;
            when not provided, it is loaded from the configuration
        base_url (str): Optional base URL for connection (default: `CIRRO_HOME` or 'cirro.bio')

    Returns:
        Authenticated Cirro API object which can be used to call endpoint functions.
        For example:

    ```python
    from cirro.cirro_client import CirroApi
    cirro = CirroApi()
    print(cirro.projects.list())
    ```
    """
    config = AppConfig(base_url=base_url)
    self._configuration = config

    # Fall back to the configured authentication when none was supplied
    if not auth_info:
        auth_info = get_auth_info_from_config(config, auth_io=None)

    api_client = CirroApiClient(
        base_url=config.rest_endpoint,
        auth_method=auth_info.get_auth_method(),
        client_name='Cirro SDK',
        package_name='cirro'
    )
    self._api_client = api_client

    # The file service is created first because the dataset service needs it
    file_service = FileService(
        api_client,
        enable_additional_checksum=config.enable_additional_checksum,
        transfer_retries=config.transfer_max_retries
    )
    self._file_service = file_service
    self._dataset_service = DatasetService(api_client, file_service=file_service)

    # The remaining services only need the API client
    self._project_service = ProjectService(api_client)
    self._process_service = ProcessService(api_client)
    self._execution_service = ExecutionService(api_client)
    self._metrics_service = MetricsService(api_client)
    self._metadata_service = MetadataService(api_client)
    self._billing_service = BillingService(api_client)
    self._references_service = ReferenceService(api_client)
Instantiates the Cirro API object
Arguments:
- auth_info (cirro.auth.base.AuthInfo): Optional authentication info; when not provided, it is loaded from the configuration
- base_url (str): Optional base URL for connection (default: `CIRRO_HOME` or 'cirro.bio')
Returns:
Authenticated Cirro API object which can be used to call endpoint functions. For example:
from cirro.cirro_client import CirroApi
cirro = CirroApi()
print(cirro.projects.list())
@property
def datasets(self) -> DatasetService:
    """
    Create, list, delete, and modify Datasets.

    Returns the dataset service instance held by this client.
    """
    return self._dataset_service
Create, list, delete, and modify Datasets
@property
def projects(self) -> ProjectService:
    """
    Create, list, delete, and modify Projects.

    Returns the project service instance held by this client.
    """
    return self._project_service
Create, list, delete, and modify Projects
@property
def processes(self) -> ProcessService:
    """
    List and retrieve detailed information about Processes.

    Returns the process service instance held by this client.
    """
    return self._process_service
List and retrieve detailed information about Processes
@property
def execution(self) -> ExecutionService:
    """
    List, run, stop, and describe the analysis jobs (executing Processes to create new Datasets).

    Returns the execution service instance held by this client.
    """
    return self._execution_service
List, run, stop, and describe the analysis jobs (executing Processes to create new Datasets)
@property
def metrics(self) -> MetricsService:
    """
    Project-level summary metrics.

    Returns the metrics service instance held by this client.
    """
    return self._metrics_service
Project-level summary metrics
@property
def metadata(self) -> MetadataService:
    """
    List and modify Sample metadata or metadata schemas.

    Returns the metadata service instance held by this client.
    """
    return self._metadata_service
List and modify Sample metadata or metadata schemas
@property
def billing(self) -> BillingService:
    """
    List and update billing accounts.

    Returns the billing service instance held by this client.
    """
    return self._billing_service
List and update billing accounts
@property
def references(self) -> ReferenceService:
    """
    List References and Reference types.

    Returns the reference service instance held by this client.
    """
    return self._references_service
List References and Reference types
@property
def file(self) -> FileService:
    """
    Read, download, and create file objects.

    Returns the file service instance held by this client.
    """
    return self._file_service
Read, download, and create file objects