cirro
"""Top-level exports for the Cirro client SDK."""

import cirro.file_utils  # noqa
from cirro.cirro_client import CirroApi
from cirro.sdk.dataset import DataPortalDataset
from cirro.sdk.login import DataPortalLogin
from cirro.sdk.portal import DataPortal
from cirro.sdk.process import DataPortalProcess
from cirro.sdk.project import DataPortalProject
from cirro.sdk.reference import DataPortalReference

# Public API of the `cirro` package
__all__ = [
    'DataPortal',
    'DataPortalLogin',
    'DataPortalProject',
    'DataPortalProcess',
    'DataPortalDataset',
    'DataPortalReference',
    'CirroApi',
    'file_utils'
]
class DataPortal:
    """
    Helper functions for exploring the Projects, Datasets, Samples, and Files
    available in the Data Portal.
    """

    def __init__(self, base_url: str = None, client: CirroApi = None):
        """
        Set up the DataPortal object, establishing an authenticated connection.

        Args:
            base_url (str): Optional base URL of the Cirro instance
             (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
            client (`cirro.cirro_client.CirroApi`): Optional pre-configured client

        Example:
            ```python
            from cirro import DataPortal

            portal = DataPortal(base_url="app.cirro.bio")
            portal.list_projects()
            ```
        """

        if client is not None:
            self._client = client

        # Set up default client if not provided
        else:
            self._client = CirroApi(base_url=base_url)

    def list_projects(self) -> DataPortalProjects:
        """List all the projects available in the Data Portal."""

        return DataPortalProjects(
            [
                DataPortalProject(proj, self._client)
                for proj in self._client.projects.list()
            ]
        )

    def get_project_by_name(self, name: str = None) -> DataPortalProject:
        """Return the project with the specified name."""

        return self.list_projects().get_by_name(name)

    def get_project_by_id(self, _id: str = None) -> DataPortalProject:
        """Return the project with the specified id."""

        return self.list_projects().get_by_id(_id)

    def get_project(self, project: str = None) -> DataPortalProject:
        """
        Return a project identified by ID or name.

        Args:
            project (str): ID or name of project

        Returns:
            `cirro.sdk.project.DataPortalProject`
        """
        # Try the identifier as an ID first, falling back to a name lookup
        try:
            return self.get_project_by_id(project)
        except DataPortalAssetNotFound:
            return self.get_project_by_name(project)

    def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset:
        """
        Return a dataset identified by ID or name.

        Args:
            project (str): ID or name of project
            dataset (str): ID or name of dataset

        Returns:
            `cirro.sdk.dataset.DataPortalDataset`

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        ```
        """
        # Resolve the project using the same ID-then-name logic as get_project
        parent = self.get_project(project)

        # Resolve the dataset within that project, again trying ID then name
        try:
            return parent.get_dataset_by_id(dataset)
        except DataPortalAssetNotFound:
            return parent.get_dataset_by_name(dataset)

    def list_processes(self, ingest=False) -> DataPortalProcesses:
        """
        List all the processes available in the Data Portal.
        By default, only list non-ingest processes (those which can be run on existing datasets).
        To list the processes which can be used to upload datasets, use `ingest = True`.

        Args:
            ingest (bool): If True, only list those processes which can be used to ingest datasets directly
        """

        return DataPortalProcesses(
            [
                DataPortalProcess(p, self._client)
                for p in self._client.processes.list()
                if not ingest or p.executor == Executor.INGEST
            ]
        )

    def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess:
        """
        Return the process with the specified name.

        Args:
            name (str): Name of process
            ingest (bool): If True, only consider ingest processes
        """

        return self.list_processes(ingest=ingest).get_by_name(name)

    def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess:
        """
        Return the process with the specified id.

        Args:
            id (str): ID of process
            ingest (bool): If True, only consider ingest processes
        """

        return self.list_processes(ingest=ingest).get_by_id(id)

    def list_reference_types(self) -> DataPortalReferenceTypes:
        """
        Return the list of all available reference types
        """

        return DataPortalReferenceTypes(
            [
                DataPortalReferenceType(ref)
                for ref in self._client.references.get_types()
            ]
        )

    @property
    def developer_helper(self) -> DeveloperHelper:
        """Helper intended for developer / advanced usage of the client."""
        return DeveloperHelper(self._client)
Helper functions for exploring the Projects, Datasets, Samples, and Files available in the Data Portal.
19 def __init__(self, base_url: str = None, client: CirroApi = None): 20 """ 21 Set up the DataPortal object, establishing an authenticated connection. 22 23 Args: 24 base_url (str): Optional base URL of the Cirro instance 25 (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file) 26 client (`cirro.cirro_client.CirroApi`): Optional pre-configured client 27 28 Example: 29 ```python 30 from cirro import DataPortal 31 32 Portal = DataPortal(base_url="app.cirro.bio") 33 portal.list_projects() 34 ``` 35 """ 36 37 if client is not None: 38 self._client = client 39 40 # Set up default client if not provided 41 else: 42 self._client = CirroApi(base_url=base_url)
Set up the DataPortal object, establishing an authenticated connection.
Arguments:
- base_url (str): Optional base URL of the Cirro instance
  (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
- client (`cirro.cirro_client.CirroApi`): Optional pre-configured client
Example:
from cirro import DataPortal
portal = DataPortal(base_url="app.cirro.bio")
portal.list_projects()
44 def list_projects(self) -> DataPortalProjects: 45 """List all the projects available in the Data Portal.""" 46 47 return DataPortalProjects( 48 [ 49 DataPortalProject(proj, self._client) 50 for proj in self._client.projects.list() 51 ] 52 )
List all the projects available in the Data Portal.
54 def get_project_by_name(self, name: str = None) -> DataPortalProject: 55 """Return the project with the specified name.""" 56 57 return self.list_projects().get_by_name(name)
Return the project with the specified name.
59 def get_project_by_id(self, _id: str = None) -> DataPortalProject: 60 """Return the project with the specified id.""" 61 62 return self.list_projects().get_by_id(_id)
Return the project with the specified id.
64 def get_project(self, project: str = None) -> DataPortalProject: 65 """ 66 Return a project identified by ID or name. 67 68 Args: 69 project (str): ID or name of project 70 71 Returns: 72 `from cirro.sdk.project import DataPortalProject` 73 """ 74 try: 75 return self.get_project_by_id(project) 76 except DataPortalAssetNotFound: 77 return self.get_project_by_name(project)
Return a project identified by ID or name.
Arguments:
- project (str): ID or name of project
Returns:
cirro.sdk.project.DataPortalProject
79 def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset: 80 """ 81 Return a dataset identified by ID or name. 82 83 Args: 84 project (str): ID or name of project 85 dataset (str): ID or name of dataset 86 87 Returns: 88 `cirro.sdk.dataset.DataPortalDataset` 89 90 ```python 91 from cirro import DataPortal() 92 portal = DataPortal() 93 dataset = portal.get_dataset( 94 project="id-or-name-of-project", 95 dataset="id-or-name-of-dataset" 96 ) 97 ``` 98 """ 99 try: 100 project: DataPortalProject = self.get_project_by_id(project) 101 except DataPortalAssetNotFound: 102 project: DataPortalProject = self.get_project_by_name(project) 103 104 try: 105 return project.get_dataset_by_id(dataset) 106 except DataPortalAssetNotFound: 107 return project.get_dataset_by_name(dataset)
Return a dataset identified by ID or name.
Arguments:
- project (str): ID or name of project
- dataset (str): ID or name of dataset
Returns:
cirro.sdk.dataset.DataPortalDataset
from cirro import DataPortal portal = DataPortal() dataset = portal.get_dataset( project="id-or-name-of-project", dataset="id-or-name-of-dataset" )
109 def list_processes(self, ingest=False) -> DataPortalProcesses: 110 """ 111 List all the processes available in the Data Portal. 112 By default, only list non-ingest processes (those which can be run on existing datasets). 113 To list the processes which can be used to upload datasets, use `ingest = True`. 114 115 Args: 116 ingest (bool): If True, only list those processes which can be used to ingest datasets directly 117 """ 118 119 return DataPortalProcesses( 120 [ 121 DataPortalProcess(p, self._client) 122 for p in self._client.processes.list() 123 if not ingest or p.executor == Executor.INGEST 124 ] 125 )
List all the processes available in the Data Portal.
By default, only list non-ingest processes (those which can be run on existing datasets).
To list the processes which can be used to upload datasets, use `ingest = True`.
Arguments:
- ingest (bool): If True, only list those processes which can be used to ingest datasets directly
127 def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess: 128 """ 129 Return the process with the specified name. 130 131 Args: 132 name (str): Name of process 133 """ 134 135 return self.list_processes(ingest=ingest).get_by_name(name)
Return the process with the specified name.
Arguments:
- name (str): Name of process
137 def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess: 138 """ 139 Return the process with the specified id 140 141 Args: 142 id (str): ID of process 143 """ 144 145 return self.list_processes(ingest=ingest).get_by_id(id)
Return the process with the specified id
Arguments:
- id (str): ID of process
147 def list_reference_types(self) -> DataPortalReferenceTypes: 148 """ 149 Return the list of all available reference types 150 """ 151 152 return DataPortalReferenceTypes( 153 [ 154 DataPortalReferenceType(ref) 155 for ref in self._client.references.get_types() 156 ] 157 )
Return the list of all available reference types
class DataPortalLogin:
    """
    Begin the login flow, capturing the authorization message which Cirro
    uses to confirm the user's identity.

    Useful when you need to authenticate a user in a non-blocking way.

    Usage:

    ```python
    # Replace app.cirro.bio as appropriate
    login = DataPortalLogin(base_url="app.cirro.bio")

    # Present the user with the authorization message
    print(login.auth_message)

    # Generate the authenticated DataPortal object,
    # blocking until the user completes the login process in their browser
    portal = login.await_completion()
    ```
    """
    base_url: str
    auth_info: DeviceCodeAuth

    def __init__(self, base_url: str = None, enable_cache=False):
        self.base_url = base_url

        # Resolve region, client id, and auth endpoint for this instance
        config = AppConfig(base_url=base_url)

        # Start the device-code flow without blocking on its completion
        self.auth_info = DeviceCodeAuth(
            region=config.region,
            client_id=config.client_id,
            auth_endpoint=config.auth_endpoint,
            enable_cache=enable_cache,
            await_completion=False
        )

    @property
    def auth_message(self) -> str:
        """Authorization message provided by Cirro."""
        return self.auth_info.auth_message

    @property
    def auth_message_markdown(self) -> str:
        """Authorization message provided by Cirro (Markdown format)."""
        return self.auth_info.auth_message_markdown

    def await_completion(self) -> DataPortal:
        """Complete the login process and return an authenticated client"""
        # Block until the user finishes the login flow in their browser
        self.auth_info.await_completion()

        # Wrap the now-authenticated API client in a DataPortal object
        return DataPortal(
            client=CirroApi(
                auth_info=self.auth_info,
                base_url=self.base_url
            )
        )
Start the login process, obtaining the authorization message from Cirro needed to confirm the user identity.
Useful when you need to authenticate a user in a non-blocking way.
Usage:
# Replace app.cirro.bio as appropriate
login = DataPortalLogin(base_url="app.cirro.bio")
# Present the user with the authorization message
print(login.auth_message)
# Generate the authenticated DataPortal object,
# blocking until the user completes the login process in their browser
portal = login.await_completion()
32 def __init__(self, base_url: str = None, enable_cache=False): 33 app_config = AppConfig(base_url=base_url) 34 35 self.base_url = base_url 36 37 self.auth_info = DeviceCodeAuth( 38 region=app_config.region, 39 client_id=app_config.client_id, 40 auth_endpoint=app_config.auth_endpoint, 41 enable_cache=enable_cache, 42 await_completion=False 43 )
45 @property 46 def auth_message(self) -> str: 47 """Authorization message provided by Cirro.""" 48 return self.auth_info.auth_message
Authorization message provided by Cirro.
50 @property 51 def auth_message_markdown(self) -> str: 52 """Authorization message provided by Cirro (Markdown format).""" 53 return self.auth_info.auth_message_markdown
Authorization message provided by Cirro (Markdown format).
55 def await_completion(self) -> DataPortal: 56 """Complete the login process and return an authenticated client""" 57 58 # Block until the user completes the login flow 59 self.auth_info.await_completion() 60 61 # Set up the client object 62 cirro_client = CirroApi( 63 auth_info=self.auth_info, 64 base_url=self.base_url 65 ) 66 67 # Return the Data Portal object 68 return DataPortal(client=cirro_client)
Complete the login process and return an authenticated client
class DataPortalProject(DataPortalAsset):
    """
    Projects in the Data Portal contain collections of Datasets.
    Users are granted permissions at the project-level, allowing them
    to view and/or modify all the datasets in that collection.
    """
    def __init__(self, proj: Project, client: CirroApi):
        """
        Instantiate with helper method

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        project = portal.get_project_by_name("Project Name")
        ```

        """
        self._data = proj
        self._client = client
        # Per-instance memo of the project's dataset list.
        # (Replaces functools.cache on the method, which would key on `self`
        # and keep every instance alive for the life of the process.)
        self._datasets = None

    @property
    def id(self) -> str:
        """
        Unique identifier
        """
        return self._data.id

    @property
    def name(self) -> str:
        """
        Readable name
        """
        return self._data.name

    @property
    def description(self) -> str:
        """
        Longer description of the project
        """
        return self._data.description

    @property
    def status(self) -> Status:
        """
        Status of the project
        """
        return self._data.status

    def __str__(self):
        """Control how the Project is rendered as a string."""

        return '\n'.join([
            f"{i.title()}: {self.__getattribute__(i)}"
            for i in ['name', 'id', 'description']
        ])

    def _get_datasets(self) -> List[Dataset]:
        # Lazily fetch the dataset list once and reuse it until invalidated
        if self._datasets is None:
            self._datasets = list_all_datasets(project_id=self.id,
                                               client=self._client)
        return self._datasets

    def list_datasets(self, force_refresh=False) -> DataPortalDatasets:
        """List all the datasets available in the project."""
        if force_refresh:
            self._datasets = None

        return DataPortalDatasets(
            [
                DataPortalDataset(d, self._client)
                for d in self._get_datasets()
            ]
        )

    def get_dataset_by_name(self, name: str, force_refresh=False) -> DataPortalDataset:
        """Return the dataset with the specified name."""
        if force_refresh:
            self._datasets = None

        dataset = next((d for d in self._get_datasets() if d.name == name), None)
        if dataset is None:
            raise DataPortalAssetNotFound(f'Dataset with name {name} not found')
        # Re-fetch by ID so the caller gets the full dataset detail
        return self.get_dataset_by_id(dataset.id)

    def get_dataset_by_id(self, _id: str = None) -> DataPortalDataset:
        """Return the dataset with the specified id."""

        dataset = self._client.datasets.get(project_id=self.id, dataset_id=_id)
        if dataset is None:
            raise DataPortalAssetNotFound(f'Dataset with ID {_id} not found')
        return DataPortalDataset(dataset, self._client)

    def list_references(self, reference_type: str = None) -> DataPortalReferences:
        """
        List the references available in a project.
        Optionally filter to references of a particular type (identified by name)
        """

        # Get the complete list of references which are available
        reference_types = DataPortalReferenceTypes(
            [
                DataPortalReferenceType(ref)
                for ref in self._client.references.get_types()
            ]
        )

        # If a particular name was specified
        if reference_type is not None:
            reference_types = reference_types.filter_by_pattern(reference_type)
            if len(reference_types) == 0:
                msg = f"Could not find any reference types with the name {reference_type}"
                raise DataPortalAssetNotFound(msg)

        return DataPortalReferences(
            [
                DataPortalReference(ref, project_id=self.id, client=self._client)
                for ref in self._client.references.get_for_project(
                    self.id
                )
                if reference_type is None or ref.type == reference_type
            ]
        )

    def get_reference_by_name(self, name: str = None, ref_type: str = None) -> DataPortalReference:
        """Return the reference of a particular type with the specified name."""

        if name is None:
            raise DataPortalInputError("Must specify the reference name")

        return self.list_references(ref_type).get_by_name(name)

    def upload_dataset(
        self,
        name: str = None,
        description='',
        process: Union[DataPortalProcess, str] = None,
        upload_folder: str = None,
        files: List[str] = None,
        tags: List[str] = None,
    ):
        """
        Upload a set of files to the Data Portal, creating a new dataset.

        If the files parameter is not provided, it will upload all files in the upload folder

        Args:
            name (str): Name of newly created dataset
            description (str): Description of newly created dataset
            process (str | DataPortalProcess): Process to run may be referenced by name, ID, or object
            upload_folder (str): Folder containing files to upload
            files (List[str]): Optional subset of files to upload from the folder
            tags (List[str]): Optional list of tags to apply to the dataset
        """

        if name is None:
            raise DataPortalInputError("Must provide name for new dataset")
        if process is None:
            raise DataPortalInputError("Must provide the process which is used for ingest")
        if upload_folder is None:
            raise DataPortalInputError("Must provide upload_folder -- folder containing files to upload")

        # Parse the process provided by the user
        process = parse_process_name_or_id(process, self._client)

        # If no files were provided
        if files is None:
            # Get the list of files in the upload folder
            files = get_files_in_directory(upload_folder)

        # NOTE(review): raising a Warning subclass as an exception is unusual;
        # kept as-is for backward compatibility with callers catching it
        if files is None or len(files) == 0:
            raise RuntimeWarning("No files to upload, exiting")

        # Normalize into Tag object
        if tags is not None:
            tags = [Tag(value=value) for value in tags]

        # Make sure that the files match the expected pattern
        self._client.processes.check_dataset_files(files, process.id, upload_folder)

        # Create the ingest process request
        dataset_create_request = UploadDatasetRequest(
            process_id=process.id,
            name=name,
            description=description,
            expected_files=files,
            tags=tags,
        )

        # Get the response
        create_response = self._client.datasets.create(project_id=self.id,
                                                       upload_request=dataset_create_request)

        # Upload the files
        self._client.datasets.upload_files(
            project_id=self.id,
            dataset_id=create_response.id,
            directory=upload_folder,
            files=files
        )

        # Return the dataset which was created, which might take a second to update
        max_attempts = 5
        for attempt in range(max_attempts):
            try:
                return self.get_dataset_by_id(create_response.id)
            except DataPortalAssetNotFound:
                if attempt == max_attempts - 1:
                    # Out of retries: re-raise with the original traceback
                    raise
                else:
                    sleep(2)

    def samples(self, max_items: int = 10000) -> List[Sample]:
        """
        Retrieves a list of samples associated with a project along with their metadata

        Args:
            max_items (int): Maximum number of records to get (default 10,000)
        """
        return self._client.metadata.get_project_samples(self.id, max_items)
Projects in the Data Portal contain collections of Datasets. Users are granted permissions at the project-level, allowing them to view and/or modify all the datasets in that collection.
26 def __init__(self, proj: Project, client: CirroApi): 27 """ 28 Instantiate with helper method 29 30 ```python 31 from cirro import DataPortal() 32 portal = DataPortal() 33 project = portal.get_project_by_name("Project Name") 34 ``` 35 36 """ 37 self._data = proj 38 self._client = client
Instantiate with helper method
from cirro import DataPortal
portal = DataPortal()
project = portal.get_project_by_name("Project Name")
54 @property 55 def description(self) -> str: 56 """ 57 Longer description of the project 58 """ 59 return self._data.description
Longer description of the project
61 @property 62 def status(self) -> Status: 63 """ 64 Status of the project 65 """ 66 return self._data.status
Status of the project
81 def list_datasets(self, force_refresh=False) -> DataPortalDatasets: 82 """List all the datasets available in the project.""" 83 if force_refresh: 84 self._get_datasets.cache_clear() 85 86 return DataPortalDatasets( 87 [ 88 DataPortalDataset(d, self._client) 89 for d in self._get_datasets() 90 ] 91 )
List all the datasets available in the project.
93 def get_dataset_by_name(self, name: str, force_refresh=False) -> DataPortalDataset: 94 """Return the dataset with the specified name.""" 95 if force_refresh: 96 self._get_datasets.cache_clear() 97 98 dataset = next((d for d in self._get_datasets() if d.name == name), None) 99 if dataset is None: 100 raise DataPortalAssetNotFound(f'Dataset with name {name} not found') 101 return self.get_dataset_by_id(dataset.id)
Return the dataset with the specified name.
103 def get_dataset_by_id(self, _id: str = None) -> DataPortalDataset: 104 """Return the dataset with the specified id.""" 105 106 dataset = self._client.datasets.get(project_id=self.id, dataset_id=_id) 107 if dataset is None: 108 raise DataPortalAssetNotFound(f'Dataset with ID {_id} not found') 109 return DataPortalDataset(dataset, self._client)
Return the dataset with the specified id.
111 def list_references(self, reference_type: str = None) -> DataPortalReferences: 112 """ 113 List the references available in a project. 114 Optionally filter to references of a particular type (identified by name) 115 """ 116 117 # Get the complete list of references which are available 118 reference_types = DataPortalReferenceTypes( 119 [ 120 DataPortalReferenceType(ref) 121 for ref in self._client.references.get_types() 122 ] 123 ) 124 125 # If a particular name was specified 126 if reference_type is not None: 127 reference_types = reference_types.filter_by_pattern(reference_type) 128 if len(reference_types) == 0: 129 msg = f"Could not find any reference types with the name {reference_type}" 130 raise DataPortalAssetNotFound(msg) 131 132 return DataPortalReferences( 133 [ 134 DataPortalReference(ref, project_id=self.id, client=self._client) 135 for ref in self._client.references.get_for_project( 136 self.id 137 ) 138 if reference_type is None or ref.type == reference_type 139 ] 140 )
List the references available in a project. Optionally filter to references of a particular type (identified by name)
142 def get_reference_by_name(self, name: str = None, ref_type: str = None) -> DataPortalReference: 143 """Return the reference of a particular type with the specified name.""" 144 145 if name is None: 146 raise DataPortalInputError("Must specify the reference name") 147 148 return self.list_references(ref_type).get_by_name(name)
Return the reference of a particular type with the specified name.
150 def upload_dataset( 151 self, 152 name: str = None, 153 description='', 154 process: Union[DataPortalProcess, str] = None, 155 upload_folder: str = None, 156 files: List[str] = None, 157 tags: List[str] = None, 158 ): 159 """ 160 Upload a set of files to the Data Portal, creating a new dataset. 161 162 If the files parameter is not provided, it will upload all files in the upload folder 163 164 Args: 165 name (str): Name of newly created dataset 166 description (str): Description of newly created dataset 167 process (str | DataPortalProcess): Process to run may be referenced by name, ID, or object 168 upload_folder (str): Folder containing files to upload 169 files (List[str]): Optional subset of files to upload from the folder 170 tags (List[str]): Optional list of tags to apply to the dataset 171 """ 172 173 if name is None: 174 raise DataPortalInputError("Must provide name for new dataset") 175 if process is None: 176 raise DataPortalInputError("Must provide the process which is used for ingest") 177 if upload_folder is None: 178 raise DataPortalInputError("Must provide upload_folder -- folder containing files to upload") 179 180 # Parse the process provided by the user 181 process = parse_process_name_or_id(process, self._client) 182 183 # If no files were provided 184 if files is None: 185 # Get the list of files in the upload folder 186 files = get_files_in_directory(upload_folder) 187 188 if files is None or len(files) == 0: 189 raise RuntimeWarning("No files to upload, exiting") 190 191 # Normalize into Tag object 192 if tags is not None: 193 tags = [Tag(value=value) for value in tags] 194 195 # Make sure that the files match the expected pattern 196 self._client.processes.check_dataset_files(files, process.id, upload_folder) 197 198 # Create the ingest process request 199 dataset_create_request = UploadDatasetRequest( 200 process_id=process.id, 201 name=name, 202 description=description, 203 expected_files=files, 204 tags=tags, 205 ) 206 207 # Get the 
response 208 create_response = self._client.datasets.create(project_id=self.id, 209 upload_request=dataset_create_request) 210 211 # Upload the files 212 self._client.datasets.upload_files( 213 project_id=self.id, 214 dataset_id=create_response.id, 215 directory=upload_folder, 216 files=files 217 ) 218 219 # Return the dataset which was created, which might take a second to update 220 max_attempts = 5 221 for attempt in range(max_attempts): 222 try: 223 return self.get_dataset_by_id(create_response.id) 224 except DataPortalAssetNotFound as e: 225 if attempt == max_attempts - 1: 226 raise e 227 else: 228 sleep(2)
Upload a set of files to the Data Portal, creating a new dataset.
If the files parameter is not provided, it will upload all files in the upload folder
Arguments:
- name (str): Name of newly created dataset
- description (str): Description of newly created dataset
- process (str | DataPortalProcess): Process to run may be referenced by name, ID, or object
- upload_folder (str): Folder containing files to upload
- files (List[str]): Optional subset of files to upload from the folder
- tags (List[str]): Optional list of tags to apply to the dataset
230 def samples(self, max_items: int = 10000) -> List[Sample]: 231 """ 232 Retrieves a list of samples associated with a project along with their metadata 233 234 Args: 235 max_items (int): Maximum number of records to get (default 10,000) 236 """ 237 return self._client.metadata.get_project_samples(self.id, max_items)
Retrieves a list of samples associated with a project along with their metadata
Arguments:
- max_items (int): Maximum number of records to get (default 10,000)
class DataPortalProcess(DataPortalAsset):
    """Helper functions for interacting with analysis processes."""

    def __init__(self, process: Union[Process, ProcessDetail], client: CirroApi):
        """
        Instantiate with helper method

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        process = portal.get_process_by_name("Process Name")
        ```
        """
        self._data = process
        self._client = client

    @property
    def id(self) -> str:
        """Unique identifier"""
        return self._data.id

    @property
    def name(self) -> str:
        """Readable name"""
        return self._data.name

    @property
    def description(self) -> str:
        """Longer description of process"""
        return self._data.description

    @property
    def child_process_ids(self) -> List[str]:
        """List of processes which can be run on the output of this process"""
        return self._data.child_process_ids

    @property
    def executor(self) -> Executor:
        """INGEST, CROMWELL, or NEXTFLOW"""
        return self._data.executor

    @property
    def category(self) -> str:
        """Category of process"""
        return self._data.category

    @property
    def pipeline_type(self) -> str:
        """Pipeline type"""
        return self._data.pipeline_type

    @property
    def documentation_url(self) -> str:
        """Documentation URL"""
        return self._data.documentation_url

    @property
    def file_requirements_message(self) -> str:
        """Description of files required for INGEST processes"""
        return self._data.file_requirements_message

    @property
    def code(self) -> PipelineCode:
        """Pipeline code configuration (fetches full detail on first access)"""
        return self._get_detail().pipeline_code

    @property
    def custom_settings(self) -> CustomPipelineSettings:
        """Custom settings for the process (fetches full detail on first access)"""
        return self._get_detail().custom_settings

    def _get_detail(self) -> ProcessDetail:
        # Lazily upgrade the summary record to a full ProcessDetail,
        # caching it on self._data so the lookup happens at most once
        if not isinstance(self._data, ProcessDetail):
            self._data = self._client.processes.get(self.id)
        return self._data

    def __str__(self):
        # Render as "Name: ...\nId: ...\nDescription: ..."
        return '\n'.join([
            f"{i.title()}: {self.__getattribute__(i)}"
            for i in ['name', 'id', 'description']
        ])

    def get_parameter_spec(self) -> ParameterSpecification:
        """
        Gets a specification used to describe the parameters used in the process.
        """
        return self._client.processes.get_parameter_spec(self.id)
Helper functions for interacting with analysis processes.
14 def __init__(self, process: Union[Process, ProcessDetail], client: CirroApi): 15 """ 16 Instantiate with helper method 17 18 ```python 19 from cirro import DataPortal() 20 portal = DataPortal() 21 process = portal.get_process_by_name("Process Name") 22 ``` 23 """ 24 self._data = process 25 self._client = client
Instantiate with helper method
from cirro import DataPortal
portal = DataPortal()
process = portal.get_process_by_name("Process Name")
37 @property 38 def description(self) -> str: 39 """Longer description of process""" 40 return self._data.description
Longer description of process
42 @property 43 def child_process_ids(self) -> List[str]: 44 """List of processes which can be run on the output of this process""" 45 return self._data.child_process_ids
List of processes which can be run on the output of this process
47 @property 48 def executor(self) -> Executor: 49 """INGEST, CROMWELL, or NEXTFLOW""" 50 return self._data.executor
INGEST, CROMWELL, or NEXTFLOW
52 @property 53 def category(self) -> str: 54 """Category of process""" 55 return self._data.category
Category of process
57 @property 58 def pipeline_type(self) -> str: 59 """Pipeline type""" 60 return self._data.pipeline_type
Pipeline type
62 @property 63 def documentation_url(self) -> str: 64 """Documentation URL""" 65 return self._data.documentation_url
Documentation URL
67 @property 68 def file_requirements_message(self) -> str: 69 """Description of files required for INGEST processes""" 70 return self._data.file_requirements_message
Description of files required for INGEST processes
72 @property 73 def code(self) -> PipelineCode: 74 """Pipeline code configuration""" 75 return self._get_detail().pipeline_code
Pipeline code configuration
77 @property 78 def custom_settings(self) -> CustomPipelineSettings: 79 """Custom settings for the process""" 80 return self._get_detail().custom_settings
Custom settings for the process
class DataPortalDataset(DataPortalAsset):
    """
    Datasets in the Data Portal are collections of files which have
    either been uploaded directly, or which have been output by
    an analysis pipeline or notebook.
    """

    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
        """
        Instantiate a dataset object

        Should be invoked from a top-level constructor, for example:

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        ```

        Args:
            dataset: Dataset record (summary or detail); must carry a project_id
            client: Authenticated API client used for all follow-up requests
        """
        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
        self._data = dataset
        # File/artifact listing is fetched lazily (see _get_assets)
        self._assets: Optional[DatasetAssets] = None
        self._client = client

    @property
    def id(self) -> str:
        """Unique identifier for the dataset"""
        return self._data.id

    @property
    def name(self) -> str:
        """Editable name for the dataset"""
        return self._data.name

    @property
    def description(self) -> str:
        """Longer description of the dataset"""
        return self._data.description

    @property
    def process_id(self) -> str:
        """Unique ID of process used to create the dataset"""
        return self._data.process_id

    @property
    def process(self) -> ProcessDetail:
        """
        Object representing the process used to create the dataset
        (retrieved from the API on each access)
        """
        return self._client.processes.get(self.process_id)

    @property
    def project_id(self) -> str:
        """ID of the project containing the dataset"""
        return self._data.project_id

    @property
    def status(self) -> Status:
        """
        Status of the dataset
        """
        return self._data.status

    @property
    def source_dataset_ids(self) -> List[str]:
        """IDs of the datasets used as sources for this dataset (if any)"""
        return self._data.source_dataset_ids

    @property
    def source_datasets(self) -> List['DataPortalDataset']:
        """
        Objects representing the datasets used as sources for this dataset (if any)
        """
        return [
            DataPortalDataset(
                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
                client=self._client
            )
            for dataset_id in self.source_dataset_ids
        ]

    @property
    def params(self) -> dict:
        """
        Parameters used to generate the dataset
        """
        return self._get_detail().params.to_dict()

    @property
    def info(self) -> dict:
        """
        Extra information about the dataset
        """
        return self._get_detail().info.to_dict()

    @property
    def tags(self) -> List[Tag]:
        """
        Tags applied to the dataset
        """
        return self._data.tags

    @property
    def share(self) -> Optional[NamedItem]:
        """
        Share associated with the dataset, if any.
        """
        return self._get_detail().share

    @property
    def created_by(self) -> str:
        """User who created the dataset"""
        return self._data.created_by

    @property
    def created_at(self) -> datetime.datetime:
        """Timestamp of dataset creation"""
        return self._data.created_at

    def _get_detail(self):
        # Upgrade the cached record to a DatasetDetail on first use
        if not isinstance(self._data, DatasetDetail):
            self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id)
        return self._data

    def _get_assets(self):
        # Fetch and cache the file/artifact listing on first use
        if not self._assets:
            self._assets = self._client.datasets.get_assets_listing(
                project_id=self.project_id,
                dataset_id=self.id
            )
        return self._assets

    def __str__(self):
        return '\n'.join([
            f"{i.title()}: {self.__getattribute__(i)}"
            for i in ['name', 'id', 'description', 'status']
        ])

    def get_file(self, relative_path: str) -> DataPortalFile:
        """
        Get a file from the dataset using its relative path.

        Args:
            relative_path (str): Relative path of file within the dataset

        Returns:
            `from cirro.sdk.file import DataPortalFile`
        """

        # Get the list of files in this dataset
        files = self.list_files()

        # Try getting the file using the relative path provided by the user
        try:
            return files.get_by_id(relative_path)
        except DataPortalAssetNotFound:
            # Try getting the file with the 'data/' prefix prepended
            try:
                return files.get_by_id("data/" + relative_path)
            except DataPortalAssetNotFound:
                # If not found, raise the exception using the string provided
                # by the user, not the data/ prepended version (which may be
                # confusing to the user)
                msg = f"No file found with path '{relative_path}'."
                raise DataPortalAssetNotFound(msg)

    def list_files(self) -> DataPortalFiles:
        """
        Return the list of files which make up the dataset.
        """
        files = self._get_assets().files
        return DataPortalFiles(
            [
                DataPortalFile(file=file, client=self._client)
                for file in files
            ]
        )

    def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
        """
        Get the artifact of a particular type from the dataset

        Raises:
            DataPortalAssetNotFound: if no artifact of that type exists
        """
        artifacts = self._get_assets().artifacts
        artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None)
        if artifact is None:
            raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'")
        return DataPortalFile(file=artifact.file, client=self._client)

    def list_artifacts(self) -> List[DataPortalFile]:
        """
        Return the list of artifacts associated with the dataset

        An artifact may be something generated as part of the analysis or other process.
        See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types.

        """
        artifacts = self._get_assets().artifacts
        return DataPortalFiles(
            [
                DataPortalFile(file=artifact.file, client=self._client)
                for artifact in artifacts
            ]
        )

    def download_files(self, download_location: str = None) -> None:
        """
        Download all the files from the dataset to a local directory.

        Args:
            download_location (str): Path to local directory
        """

        # Alias for internal method
        self.list_files().download(download_location)

    def run_analysis(
            self,
            name: str = None,
            description: str = "",
            process: Union[DataPortalProcess, str] = None,
            params=None,
            notifications_emails: List[str] = None,
            compute_environment: str = None,
            resume_dataset_id: str = None
    ) -> str:
        """
        Runs an analysis on a dataset, returns the ID of the newly created dataset.

        The process can be provided as either a DataPortalProcess object,
        or a string which corresponds to the name or ID of the process.

        Args:
            name (str): Name of newly created dataset
            description (str): Description of newly created dataset
            process (DataPortalProcess or str): Process to run
            params (dict): Analysis parameters
            notifications_emails (List[str]): Notification email address(es)
            compute_environment (str): Name or ID of compute environment to use,
             if blank it will run in AWS
            resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
             It will attempt to re-use the previous output to minimize duplicate work

        Returns:
            dataset_id (str): ID of newly created dataset
        """
        if name is None:
            raise DataPortalInputError("Must specify 'name' for run_analysis")
        if process is None:
            raise DataPortalInputError("Must specify 'process' for run_analysis")
        if notifications_emails is None:
            notifications_emails = []
        if params is None:
            params = {}

        # If the process is a string, try to parse it as a process name or ID
        process = parse_process_name_or_id(process, self._client)

        if compute_environment:
            # Preserve the user-supplied name/ID before it is replaced by the
            # resolved record, so the not-found error reports what was asked for
            # (previously this message always printed 'None')
            requested_env = compute_environment
            compute_environments = self._client.compute_environments.list_environments_for_project(
                project_id=self.project_id
            )
            compute_environment = next(
                (env for env in compute_environments
                 if env.name == requested_env or env.id == requested_env),
                None
            )
            if compute_environment is None:
                raise DataPortalInputError(f"Compute environment '{requested_env}' not found")

        resp = self._client.execution.run_analysis(
            project_id=self.project_id,
            request=RunAnalysisRequest(
                name=name,
                description=description,
                process_id=process.id,
                source_dataset_ids=[self.id],
                params=RunAnalysisRequestParams.from_dict(params),
                notification_emails=notifications_emails,
                resume_dataset_id=resume_dataset_id,
                compute_environment_id=compute_environment.id if compute_environment else None
            )
        )
        return resp.id

    def update_samplesheet(self,
                           contents: str = None,
                           file_path: PathLike = None):
        """
        Updates the samplesheet metadata of a dataset.
        Provide either the contents (as a string) or a file path.
        Both must be in the format of a CSV.

        Args:
            contents (str): Samplesheet contents to update (should be a CSV string)
            file_path (PathLike): Path of file to update (should be a CSV file)

        Example:
        ```python
        dataset.update_samplesheet(
            file_path=Path('~/samplesheet.csv')
        )
        ```
        """

        if contents is None and file_path is None:
            raise DataPortalInputError("Must specify either 'contents' or 'file_path' when updating samplesheet")

        # Only INGEST datasets carry a samplesheet
        if self.process.executor != Executor.INGEST:
            raise DataPortalInputError("Cannot update a samplesheet on a non-ingest dataset")

        samplesheet_contents = contents
        if file_path is not None:
            samplesheet_contents = Path(file_path).expanduser().read_text()

        # Validate samplesheet
        file_names = [f.file_name for f in self.list_files()]
        request = ValidateFileRequirementsRequest(
            file_names=file_names,
            sample_sheet=samplesheet_contents,
        )
        requirements = validate_file_requirements.sync(process_id=self.process_id,
                                                       body=request,
                                                       client=self._client.api_client)
        if error_msg := requirements.error_msg:
            raise DataPortalInputError(error_msg)

        # Update the samplesheet if everything looks ok
        self._client.datasets.update_samplesheet(
            project_id=self.project_id,
            dataset_id=self.id,
            samplesheet=samplesheet_contents
        )
Datasets in the Data Portal are collections of files which have either been uploaded directly, or which have been output by an analysis pipeline or notebook.
28 def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi): 29 """ 30 Instantiate a dataset object 31 32 Should be invoked from a top-level constructor, for example: 33 34 ```python 35 from cirro import DataPortal() 36 portal = DataPortal() 37 dataset = portal.get_dataset( 38 project="id-or-name-of-project", 39 dataset="id-or-name-of-dataset" 40 ) 41 ``` 42 43 """ 44 assert dataset.project_id is not None, "Must provide dataset with project_id attribute" 45 self._data = dataset 46 self._assets: Optional[DatasetAssets] = None 47 self._client = client
Instantiate a dataset object
Should be invoked from a top-level constructor, for example:
from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
project="id-or-name-of-project",
dataset="id-or-name-of-dataset"
)
49 @property 50 def id(self) -> str: 51 """Unique identifier for the dataset""" 52 return self._data.id
Unique identifier for the dataset
54 @property 55 def name(self) -> str: 56 """Editable name for the dataset""" 57 return self._data.name
Editable name for the dataset
59 @property 60 def description(self) -> str: 61 """Longer name for the dataset""" 62 return self._data.description
Longer name for the dataset
64 @property 65 def process_id(self) -> str: 66 """Unique ID of process used to create the dataset""" 67 return self._data.process_id
Unique ID of process used to create the dataset
69 @property 70 def process(self) -> ProcessDetail: 71 """ 72 Object representing the process used to create the dataset 73 """ 74 return self._client.processes.get(self.process_id)
Object representing the process used to create the dataset
76 @property 77 def project_id(self) -> str: 78 """ID of the project containing the dataset""" 79 return self._data.project_id
ID of the project containing the dataset
81 @property 82 def status(self) -> Status: 83 """ 84 Status of the dataset 85 """ 86 return self._data.status
Status of the dataset
88 @property 89 def source_dataset_ids(self) -> List[str]: 90 """IDs of the datasets used as sources for this dataset (if any)""" 91 return self._data.source_dataset_ids
IDs of the datasets used as sources for this dataset (if any)
93 @property 94 def source_datasets(self) -> List['DataPortalDataset']: 95 """ 96 Objects representing the datasets used as sources for this dataset (if any) 97 """ 98 return [ 99 DataPortalDataset( 100 dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id), 101 client=self._client 102 ) 103 for dataset_id in self.source_dataset_ids 104 ]
Objects representing the datasets used as sources for this dataset (if any)
106 @property 107 def params(self) -> dict: 108 """ 109 Parameters used to generate the dataset 110 """ 111 return self._get_detail().params.to_dict()
Parameters used to generate the dataset
113 @property 114 def info(self) -> dict: 115 """ 116 Extra information about the dataset 117 """ 118 return self._get_detail().info.to_dict()
Extra information about the dataset
134 @property 135 def created_by(self) -> str: 136 """User who created the dataset""" 137 return self._data.created_by
User who created the dataset
139 @property 140 def created_at(self) -> datetime.datetime: 141 """Timestamp of dataset creation""" 142 return self._data.created_at
Timestamp of dataset creation
163 def get_file(self, relative_path: str) -> DataPortalFile: 164 """ 165 Get a file from the dataset using its relative path. 166 167 Args: 168 relative_path (str): Relative path of file within the dataset 169 170 Returns: 171 `from cirro.sdk.file import DataPortalFile` 172 """ 173 174 # Get the list of files in this dataset 175 files = self.list_files() 176 177 # Try getting the file using the relative path provided by the user 178 try: 179 return files.get_by_id(relative_path) 180 except DataPortalAssetNotFound: 181 # Try getting the file with the 'data/' prefix prepended 182 try: 183 return files.get_by_id("data/" + relative_path) 184 except DataPortalAssetNotFound: 185 # If not found, raise the exception using the string provided 186 # by the user, not the data/ prepended version (which may be 187 # confusing to the user) 188 msg = '\n'.join([f"No file found with path '{relative_path}'."]) 189 raise DataPortalAssetNotFound(msg)
Get a file from the dataset using its relative path.
Arguments:
- relative_path (str): Relative path of file within the dataset
Returns:
from cirro.sdk.file import DataPortalFile
191 def list_files(self) -> DataPortalFiles: 192 """ 193 Return the list of files which make up the dataset. 194 """ 195 files = self._get_assets().files 196 return DataPortalFiles( 197 [ 198 DataPortalFile(file=file, client=self._client) 199 for file in files 200 ] 201 )
Return the list of files which make up the dataset.
203 def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile: 204 """ 205 Get the artifact of a particular type from the dataset 206 """ 207 artifacts = self._get_assets().artifacts 208 artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None) 209 if artifact is None: 210 raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'") 211 return DataPortalFile(file=artifact.file, client=self._client)
Get the artifact of a particular type from the dataset
213 def list_artifacts(self) -> List[DataPortalFile]: 214 """ 215 Return the list of artifacts associated with the dataset 216 217 An artifact may be something generated as part of the analysis or other process. 218 See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types. 219 220 """ 221 artifacts = self._get_assets().artifacts 222 return DataPortalFiles( 223 [ 224 DataPortalFile(file=artifact.file, client=self._client) 225 for artifact in artifacts 226 ] 227 )
Return the list of artifacts associated with the dataset
An artifact may be something generated as part of the analysis or other process.
See cirro_api_client.v1.models.ArtifactType
for the list of possible artifact types.
229 def download_files(self, download_location: str = None) -> None: 230 """ 231 Download all the files from the dataset to a local directory. 232 233 Args: 234 download_location (str): Path to local directory 235 """ 236 237 # Alias for internal method 238 self.list_files().download(download_location)
Download all the files from the dataset to a local directory.
Arguments:
- download_location (str): Path to local directory
240 def run_analysis( 241 self, 242 name: str = None, 243 description: str = "", 244 process: Union[DataPortalProcess, str] = None, 245 params=None, 246 notifications_emails: List[str] = None, 247 compute_environment: str = None, 248 resume_dataset_id: str = None 249 ) -> str: 250 """ 251 Runs an analysis on a dataset, returns the ID of the newly created dataset. 252 253 The process can be provided as either a DataPortalProcess object, 254 or a string which corresponds to the name or ID of the process. 255 256 Args: 257 name (str): Name of newly created dataset 258 description (str): Description of newly created dataset 259 process (DataPortalProcess or str): Process to run 260 params (dict): Analysis parameters 261 notifications_emails (List[str]): Notification email address(es) 262 compute_environment (str): Name or ID of compute environment to use, 263 if blank it will run in AWS 264 resume_dataset_id (str): ID of dataset to resume from, used for caching task execution. 265 It will attempt to re-use the previous output to minimize duplicate work 266 267 Returns: 268 dataset_id (str): ID of newly created dataset 269 """ 270 if name is None: 271 raise DataPortalInputError("Must specify 'name' for run_analysis") 272 if process is None: 273 raise DataPortalInputError("Must specify 'process' for run_analysis") 274 if notifications_emails is None: 275 notifications_emails = [] 276 if params is None: 277 params = {} 278 279 # If the process is a string, try to parse it as a process name or ID 280 process = parse_process_name_or_id(process, self._client) 281 282 if compute_environment: 283 compute_environments = self._client.compute_environments.list_environments_for_project( 284 project_id=self.project_id 285 ) 286 compute_environment = next( 287 (env for env in compute_environments 288 if env.name == compute_environment or env.id == compute_environment), 289 None 290 ) 291 if compute_environment is None: 292 raise DataPortalInputError(f"Compute environment 
'{compute_environment}' not found") 293 294 resp = self._client.execution.run_analysis( 295 project_id=self.project_id, 296 request=RunAnalysisRequest( 297 name=name, 298 description=description, 299 process_id=process.id, 300 source_dataset_ids=[self.id], 301 params=RunAnalysisRequestParams.from_dict(params), 302 notification_emails=notifications_emails, 303 resume_dataset_id=resume_dataset_id, 304 compute_environment_id=compute_environment.id if compute_environment else None 305 ) 306 ) 307 return resp.id
Runs an analysis on a dataset, returns the ID of the newly created dataset.
The process can be provided as either a DataPortalProcess object, or a string which corresponds to the name or ID of the process.
Arguments:
- name (str): Name of newly created dataset
- description (str): Description of newly created dataset
- process (DataPortalProcess or str): Process to run
- params (dict): Analysis parameters
- notifications_emails (List[str]): Notification email address(es)
- compute_environment (str): Name or ID of compute environment to use, if blank it will run in AWS
- resume_dataset_id (str): ID of dataset to resume from, used for caching task execution. It will attempt to re-use the previous output to minimize duplicate work
Returns:
dataset_id (str): ID of newly created dataset
309 def update_samplesheet(self, 310 contents: str = None, 311 file_path: PathLike = None): 312 """ 313 Updates the samplesheet metadata of a dataset. 314 Provide either the contents (as a string) or a file path. 315 Both must be in the format of a CSV. 316 317 Args: 318 contents (str): Samplesheet contents to update (should be a CSV string) 319 file_path (PathLike): Path of file to update (should be a CSV file) 320 321 Example: 322 ```python 323 dataset.update_samplesheet( 324 file_path=Path('~/samplesheet.csv') 325 ) 326 ``` 327 """ 328 329 if contents is None and file_path is None: 330 raise DataPortalInputError("Must specify either 'contents' or 'file_path' when updating samplesheet") 331 332 if self.process.executor != Executor.INGEST: 333 raise DataPortalInputError("Cannot update a samplesheet on a non-ingest dataset") 334 335 samplesheet_contents = contents 336 if file_path is not None: 337 samplesheet_contents = Path(file_path).expanduser().read_text() 338 339 # Validate samplesheet 340 file_names = [f.file_name for f in self.list_files()] 341 request = ValidateFileRequirementsRequest( 342 file_names=file_names, 343 sample_sheet=samplesheet_contents, 344 ) 345 requirements = validate_file_requirements.sync(process_id=self.process_id, 346 body=request, 347 client=self._client.api_client) 348 if error_msg := requirements.error_msg: 349 raise DataPortalInputError(error_msg) 350 351 # Update the samplesheet if everything looks ok 352 self._client.datasets.update_samplesheet( 353 project_id=self.project_id, 354 dataset_id=self.id, 355 samplesheet=samplesheet_contents 356 )
Updates the samplesheet metadata of a dataset. Provide either the contents (as a string) or a file path. Both must be in the format of a CSV.
Arguments:
- contents (str): Samplesheet contents to update (should be a CSV string)
- file_path (PathLike): Path of file to update (should be a CSV file)
Example:
dataset.update_samplesheet(
file_path=Path('~/samplesheet.csv')
)
class DataPortalReference(DataPortalAsset):
    """
    Reference data object containing files which can be used for analysis in a particular project.
    """
    def __init__(self, ref: Reference, project_id: str, client: CirroApi):
        """
        Instantiate by listing the references which have been added to a particular project
        ```python
        from cirro import DataPortal
        portal = DataPortal()
        project = portal.get_project_by_name("Project Name")
        references = project.list_references()
        ```

        Args:
            ref: Reference record to wrap
            project_id: ID of the project the reference files belong to
            client: Authenticated API client used by the wrapped file objects
        """
        self._data = ref
        # Wrap each file entry so downstream code gets DataPortalFile helpers
        self._files = [
            DataPortalFile(File.from_file_entry(f, project_id), client) for f in ref.files
        ]

    @property
    def files(self) -> List[DataPortalFile]:
        """File(s) contained in the reference"""
        return self._files

    @property
    def name(self) -> str:
        """Reference name"""
        return self._data.name

    @property
    def type(self) -> str:
        """Type of reference data (e.g. genome_fasta)"""
        return self._data.type

    @property
    def absolute_path(self):
        """Absolute path of the first file in the reference, or None if it contains no files."""
        if len(self._files) == 0:
            return None
        return self._files[0].absolute_path

    def __str__(self):
        return self.name
Reference data object containing files which can be used for analysis in a particular project.
16 def __init__(self, ref: Reference, project_id: str, client: CirroApi): 17 """ 18 Instantiate by listing the references which have been added to a particular project 19 ```python 20 from cirro import DataPortal() 21 portal = DataPortal() 22 project = portal.get_project_by_name("Project Name") 23 references = project.list_references() 24 ``` 25 """ 26 self._data = ref 27 self._files = [ 28 DataPortalFile(File.from_file_entry(f, project_id), client) for f in ref.files 29 ]
Instantiate by listing the references which have been added to a particular project
from cirro import DataPortal
portal = DataPortal()
project = portal.get_project_by_name("Project Name")
references = project.list_references()
31 @property 32 def files(self) -> List[DataPortalFile]: 33 """File(s) contained in the reference""" 34 return self._files
File(s) contained in the reference
class CirroApi:
    """
    Client for interacting directly with the Cirro API
    """
    def __init__(self, auth_info: AuthInfo = None, base_url: str = None):
        """
        Instantiates the Cirro API object

        Args:
            auth_info (cirro.auth.base.AuthInfo): Optional authentication method
             (if not provided, it is resolved from the local configuration)
            base_url (str): Optional base URL of the Cirro instance
             (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)

        Returns:
            Authenticated Cirro API object, which can be used to call endpoint functions.

        Example:
        ```python
        from cirro.cirro_client import CirroApi

        cirro = CirroApi(base_url="app.cirro.bio")
        print(cirro.projects.list())
        ```
        """

        self._configuration = AppConfig(base_url=base_url)
        # Fall back to config-derived credentials when none were supplied
        if not auth_info:
            auth_info = get_auth_info_from_config(self._configuration, auth_io=None)

        self._api_client = CirroApiClient(
            base_url=self._configuration.rest_endpoint,
            auth_method=auth_info.get_auth_method(),
            client_name='Cirro SDK',
            package_name='cirro'
        )

        # Init services (all share the single authenticated API client;
        # dataset and reference services also share the file service)
        self._file_service = FileService(self._api_client,
                                         checksum_method=self._configuration.checksum_method,
                                         transfer_retries=self._configuration.transfer_max_retries)
        self._dataset_service = DatasetService(self._api_client, file_service=self._file_service)
        self._project_service = ProjectService(self._api_client)
        self._process_service = ProcessService(self._api_client)
        self._execution_service = ExecutionService(self._api_client)
        self._compute_environment_service = ComputeEnvironmentService(self._api_client)
        self._metrics_service = MetricsService(self._api_client)
        self._metadata_service = MetadataService(self._api_client)
        self._billing_service = BillingService(self._api_client)
        self._references_service = ReferenceService(self._api_client, file_service=self._file_service)
        self._shares_service = ShareService(self._api_client)
        self._users_service = UserService(self._api_client)

    @property
    def datasets(self) -> DatasetService:
        """
        Create, list, delete, and modify Datasets
        """
        return self._dataset_service

    @property
    def projects(self) -> ProjectService:
        """
        Create, list, delete, and modify Projects
        """
        return self._project_service

    @property
    def processes(self) -> ProcessService:
        """
        List and retrieve detailed information about Processes
        """
        return self._process_service

    @property
    def execution(self) -> ExecutionService:
        """
        List, run, stop, and describe the analysis jobs (executing Processes to create new Datasets)
        """
        return self._execution_service

    @property
    def compute_environments(self) -> ComputeEnvironmentService:
        """
        List and update compute environments
        """
        return self._compute_environment_service

    @property
    def metrics(self) -> MetricsService:
        """
        Project-level summary metrics
        """
        return self._metrics_service

    @property
    def metadata(self) -> MetadataService:
        """
        List and modify Sample metadata or metadata schemas
        """
        return self._metadata_service

    @property
    def billing(self) -> BillingService:
        """
        List and update billing accounts
        """
        return self._billing_service

    @property
    def references(self) -> ReferenceService:
        """
        List References and Reference types
        """
        return self._references_service

    @property
    def shares(self) -> ShareService:
        """
        List, create, update, delete, and subscribe to shares
        """
        return self._shares_service

    @property
    def users(self) -> UserService:
        """
        List and update user information
        """
        return self._users_service

    @property
    def file(self) -> FileService:
        """
        Read, download, and create file objects
        """
        return self._file_service

    @property
    def api_client(self) -> CirroApiClient:
        """
        Gets the underlying API client
        """
        return self._api_client

    @property
    def configuration(self) -> AppConfig:
        """
        Gets the configuration of the instance
        """
        return self._configuration
Client for interacting directly with the Cirro API
16 def __init__(self, auth_info: AuthInfo = None, base_url: str = None): 17 """ 18 Instantiates the Cirro API object 19 20 Args: 21 auth_info (cirro.auth.base.AuthInfo): 22 base_url (str): Optional base URL of the Cirro instance 23 (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file) 24 25 Returns: 26 Authenticated Cirro API object, which can be used to call endpoint functions. 27 28 Example: 29 ```python 30 from cirro.cirro_client import CirroApi 31 32 cirro = CirroApi(base_url="app.cirro.bio") 33 print(cirro.projects.list()) 34 ``` 35 """ 36 37 self._configuration = AppConfig(base_url=base_url) 38 if not auth_info: 39 auth_info = get_auth_info_from_config(self._configuration, auth_io=None) 40 41 self._api_client = CirroApiClient( 42 base_url=self._configuration.rest_endpoint, 43 auth_method=auth_info.get_auth_method(), 44 client_name='Cirro SDK', 45 package_name='cirro' 46 ) 47 48 # Init services 49 self._file_service = FileService(self._api_client, 50 checksum_method=self._configuration.checksum_method, 51 transfer_retries=self._configuration.transfer_max_retries) 52 self._dataset_service = DatasetService(self._api_client, file_service=self._file_service) 53 self._project_service = ProjectService(self._api_client) 54 self._process_service = ProcessService(self._api_client) 55 self._execution_service = ExecutionService(self._api_client) 56 self._compute_environment_service = ComputeEnvironmentService(self._api_client) 57 self._metrics_service = MetricsService(self._api_client) 58 self._metadata_service = MetadataService(self._api_client) 59 self._billing_service = BillingService(self._api_client) 60 self._references_service = ReferenceService(self._api_client, file_service=self._file_service) 61 self._shares_service = ShareService(self._api_client) 62 self._users_service = UserService(self._api_client)
Instantiates the Cirro API object
Arguments:
- auth_info (cirro.auth.base.AuthInfo):
- base_url (str): Optional base URL of the Cirro instance (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
Returns:
Authenticated Cirro API object, which can be used to call endpoint functions.
Example:

    from cirro.cirro_client import CirroApi

    cirro = CirroApi(base_url="app.cirro.bio")
    print(cirro.projects.list())
@property
def datasets(self) -> DatasetService:
    """Service handle for creating, listing, deleting, and modifying Datasets."""
    return self._dataset_service
Create, list, delete, and modify Datasets
@property
def projects(self) -> ProjectService:
    """Service handle for creating, listing, deleting, and modifying Projects."""
    return self._project_service
Create, list, delete, and modify Projects
@property
def processes(self) -> ProcessService:
    """Service handle for listing and retrieving detailed information about Processes."""
    return self._process_service
List and retrieve detailed information about Processes
@property
def execution(self) -> ExecutionService:
    """Service handle for listing, running, stopping, and describing analysis jobs
    (executing Processes to create new Datasets)."""
    return self._execution_service
List, run, stop, and describe the analysis jobs (executing Processes to create new Datasets)
@property
def compute_environments(self) -> ComputeEnvironmentService:
    """Service handle for listing and updating compute environments."""
    return self._compute_environment_service
List and update compute environments
@property
def metrics(self) -> MetricsService:
    """Service handle for project-level summary metrics."""
    return self._metrics_service
Project-level summary metrics
@property
def metadata(self) -> MetadataService:
    """Service handle for listing and modifying Sample metadata or metadata schemas."""
    return self._metadata_service
List and modify Sample metadata or metadata schemas
@property
def billing(self) -> BillingService:
    """Service handle for listing and updating billing accounts."""
    return self._billing_service
List and update billing accounts
@property
def references(self) -> ReferenceService:
    """Service handle for listing References and Reference types."""
    return self._references_service
List References and Reference types
@property
def users(self) -> UserService:
    """Service handle for listing and updating user information."""
    return self._users_service
List and update user information
@property
def file(self) -> FileService:
    """Service handle for reading, downloading, and creating file objects."""
    return self._file_service
Read, download, and create file objects