cirro
1import cirro.file_utils # noqa 2from cirro.cirro_client import CirroApi 3from cirro.sdk.dataset import DataPortalDataset 4from cirro.sdk.login import DataPortalLogin 5from cirro.sdk.portal import DataPortal 6from cirro.sdk.process import DataPortalProcess 7from cirro.sdk.project import DataPortalProject 8from cirro.sdk.reference import DataPortalReference 9 10__all__ = [ 11 'DataPortal', 12 'DataPortalLogin', 13 'DataPortalProject', 14 'DataPortalProcess', 15 'DataPortalDataset', 16 'DataPortalReference', 17 'CirroApi', 18 'file_utils' 19]
class DataPortal:
    """
    Helper functions for exploring the Projects, Datasets, Samples, and Files
    available in the Data Portal.
    """

    def __init__(self, base_url: str = None, client: CirroApi = None):
        """
        Set up the DataPortal object, establishing an authenticated connection.

        Args:
            base_url (str): Optional base URL of the Cirro instance
             (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
            client (`cirro.cirro_client.CirroApi`): Optional pre-configured client

        Example:
        ```python
        from cirro import DataPortal

        portal = DataPortal(base_url="app.cirro.bio")
        portal.list_projects()
        ```
        """
        if client is not None:
            self._client = client

        # Set up default client if not provided
        else:
            self._client = CirroApi(base_url=base_url)

    def list_projects(self) -> DataPortalProjects:
        """List all the projects available in the Data Portal."""

        return DataPortalProjects(
            [
                DataPortalProject(proj, self._client)
                for proj in self._client.projects.list()
            ]
        )

    def get_project_by_name(self, name: str = None) -> DataPortalProject:
        """Return the project with the specified name."""

        return self.list_projects().get_by_name(name)

    def get_project_by_id(self, _id: str = None) -> DataPortalProject:
        """Return the project with the specified id."""

        return self.list_projects().get_by_id(_id)

    def get_project(self, project: str = None) -> DataPortalProject:
        """
        Return a project identified by ID or name.

        Args:
            project (str): ID or name of project

        Returns:
            `cirro.sdk.project.DataPortalProject`
        """
        # Try an exact ID match first, then fall back to name matching
        try:
            return self.get_project_by_id(project)
        except DataPortalAssetNotFound:
            return self.get_project_by_name(project)

    def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset:
        """
        Return a dataset identified by ID or name.

        Args:
            project (str): ID or name of project
            dataset (str): ID or name of dataset

        Returns:
            `cirro.sdk.dataset.DataPortalDataset`

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        ```
        """
        # Reuse the ID-then-name resolution implemented by get_project
        return self.get_project(project).get_dataset(dataset)

    def read_files(
        self,
        project: str,
        dataset: str,
        glob: str = None,
        pattern: str = None,
        filetype: str = None,
        **kwargs
    ):
        """
        Read the contents of files from a dataset.

        The project and dataset can each be identified by name or ID.
        Exactly one of ``glob`` or ``pattern`` must be provided.

        **glob** — standard wildcard matching; yields the file content for each
        matching file:

        - ``*`` matches any characters within a single path segment
        - ``**`` matches zero or more path segments
        - Matching is suffix-anchored (``*.csv`` matches at any depth)

        **pattern** — like ``glob`` but ``{name}`` placeholders capture portions
        of the path automatically; yields ``(content, meta)`` pairs where
        *meta* is a ``dict`` of extracted values:

        - ``{name}`` captures one path segment (no ``/``)
        - ``*`` and ``**`` wildcards work as in ``glob``

        Args:
            project (str): ID or name of the project.
            dataset (str): ID or name of the dataset.
            glob (str): Wildcard expression to match files
                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
                Yields one item per matching file: the parsed content.
            pattern (str): Wildcard expression with ``{name}`` capture
                placeholders (e.g., ``'{sample}.csv'``,
                ``'{condition}/{sample}.csv'``).
                Yields ``(content, meta)`` per matching file.
            filetype (str): File format used to parse each file. Supported values:

                - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
                - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
                - ``'json'``: parse with :func:`json.loads`, returns a Python object
                - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame``
                  (requires ``pyarrow`` or ``fastparquet``)
                - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame``
                  (requires ``pyarrow``)
                - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object
                - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame``
                  (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``)
                - ``'text'``: read as plain text, returns a ``str``
                - ``'bytes'``: read as raw bytes, returns ``bytes``
                - ``None`` (default): infer from file extension
                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``,
                  ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
                  ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``,
                  ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``)
            **kwargs: Additional keyword arguments forwarded to the file-parsing
                function (e.g., ``sep='\\t'`` for CSV/TSV files).

        Yields:
            - When using ``glob``: *content* for each matching file
            - When using ``pattern``: ``(content, meta)`` for each matching file,
              where *meta* is a ``dict`` of values extracted from ``{name}``
              placeholders

        Raises:
            DataPortalInputError: if both ``glob`` and ``pattern`` are provided,
                or if neither is provided.

        Example:
        ```python
        # Read all CSV files — just the content
        for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
            print(df.shape)

        # Extract sample names from filenames automatically
        for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
            print(meta['sample'], df.shape)

        # Multi-level capture: condition directory + sample filename
        for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
            print(meta['condition'], meta['sample'], df.shape)

        # Read gzip-compressed TSV files with explicit separator
        for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'):
            print(df.shape)
        ```
        """
        ds = self.get_dataset(project=project, dataset=dataset)
        # Delegate validation of glob/pattern exclusivity to the dataset
        yield from ds.read_files(glob=glob, pattern=pattern, filetype=filetype, **kwargs)

    def read_file(
        self,
        project: str,
        dataset: str,
        path: str = None,
        glob: str = None,
        filetype: str = None,
        **kwargs
    ):
        """
        Read the contents of a single file from a dataset.

        The project and dataset can each be identified by name or ID.
        Provide either ``path`` (exact relative path) or ``glob`` (wildcard
        expression). If ``glob`` is used it must match exactly one file.

        Args:
            project (str): ID or name of the project.
            dataset (str): ID or name of the dataset.
            path (str): Exact relative path of the file within the dataset.
            glob (str): Wildcard expression matching exactly one file.
            filetype (str): File format used to parse the file. Supported values
                are the same as :meth:`read_files`.
            **kwargs: Additional keyword arguments forwarded to the
                file-parsing function.

        Returns:
            Parsed file content.

        Raises:
            DataPortalInputError: if both or neither of ``path``/``glob`` are
                provided, or if ``glob`` matches zero or more than one file.
        """
        ds = self.get_dataset(project=project, dataset=dataset)
        return ds.read_file(path=path, glob=glob, filetype=filetype, **kwargs)

    def list_processes(self, ingest=False) -> DataPortalProcesses:
        """
        List all the processes available in the Data Portal.
        By default, only list non-ingest processes (those which can be run on existing datasets).
        To list the processes which can be used to upload datasets, use `ingest = True`.

        Args:
            ingest (bool): If True, only list those processes which can be used to ingest datasets directly
        """

        return DataPortalProcesses(
            [
                DataPortalProcess(p, self._client)
                for p in self._client.processes.list()
                if not ingest or p.executor == Executor.INGEST
            ]
        )

    def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess:
        """
        Return the process with the specified name.

        Args:
            name (str): Name of process
        """

        return self.list_processes(ingest=ingest).get_by_name(name)

    def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess:
        """
        Return the process with the specified id

        Args:
            id (str): ID of process
        """

        return self.list_processes(ingest=ingest).get_by_id(id)

    def list_reference_types(self) -> DataPortalReferenceTypes:
        """
        Return the list of all available reference types
        """

        return DataPortalReferenceTypes(
            [
                DataPortalReferenceType(ref)
                for ref in self._client.references.get_types()
            ]
        )

    @property
    def developer_helper(self) -> DeveloperHelper:
        """Helper object exposing developer-oriented utilities for this client."""
        return DeveloperHelper(self._client)
Helper functions for exploring the Projects, Datasets, Samples, and Files available in the Data Portal.
19 def __init__(self, base_url: str = None, client: CirroApi = None): 20 """ 21 Set up the DataPortal object, establishing an authenticated connection. 22 23 Args: 24 base_url (str): Optional base URL of the Cirro instance 25 (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file) 26 client (`cirro.cirro_client.CirroApi`): Optional pre-configured client 27 28 Example: 29 ```python 30 from cirro import DataPortal 31 32 portal = DataPortal(base_url="app.cirro.bio") 33 portal.list_projects() 34 ``` 35 """ 36 37 if client is not None: 38 self._client = client 39 40 # Set up default client if not provided 41 else: 42 self._client = CirroApi(base_url=base_url)
Set up the DataPortal object, establishing an authenticated connection.
Arguments:
- base_url (str): Optional base URL of the Cirro instance
(if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
- client (`cirro.cirro_client.CirroApi`): Optional pre-configured client
Example:
from cirro import DataPortal
portal = DataPortal(base_url="app.cirro.bio")
portal.list_projects()
44 def list_projects(self) -> DataPortalProjects: 45 """List all the projects available in the Data Portal.""" 46 47 return DataPortalProjects( 48 [ 49 DataPortalProject(proj, self._client) 50 for proj in self._client.projects.list() 51 ] 52 )
List all the projects available in the Data Portal.
54 def get_project_by_name(self, name: str = None) -> DataPortalProject: 55 """Return the project with the specified name.""" 56 57 return self.list_projects().get_by_name(name)
Return the project with the specified name.
59 def get_project_by_id(self, _id: str = None) -> DataPortalProject: 60 """Return the project with the specified id.""" 61 62 return self.list_projects().get_by_id(_id)
Return the project with the specified id.
64 def get_project(self, project: str = None) -> DataPortalProject: 65 """ 66 Return a project identified by ID or name. 67 68 Args: 69 project (str): ID or name of project 70 71 Returns: 72 `from cirro.sdk.project import DataPortalProject` 73 """ 74 try: 75 return self.get_project_by_id(project) 76 except DataPortalAssetNotFound: 77 return self.get_project_by_name(project)
Return a project identified by ID or name.
Arguments:
- project (str): ID or name of project
Returns:
`cirro.sdk.project.DataPortalProject`
79 def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset: 80 """ 81 Return a dataset identified by ID or name. 82 83 Args: 84 project (str): ID or name of project 85 dataset (str): ID or name of dataset 86 87 Returns: 88 `cirro.sdk.dataset.DataPortalDataset` 89 90 ```python 91 from cirro import DataPortal() 92 portal = DataPortal() 93 dataset = portal.get_dataset( 94 project="id-or-name-of-project", 95 dataset="id-or-name-of-dataset" 96 ) 97 ``` 98 """ 99 try: 100 project: DataPortalProject = self.get_project_by_id(project) 101 except DataPortalAssetNotFound: 102 project: DataPortalProject = self.get_project_by_name(project) 103 104 return project.get_dataset(dataset)
Return a dataset identified by ID or name.
Arguments:
- project (str): ID or name of project
- dataset (str): ID or name of dataset
Returns:
`cirro.sdk.dataset.DataPortalDataset`

```python
from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
    project="id-or-name-of-project",
    dataset="id-or-name-of-dataset"
)
```
106 def read_files( 107 self, 108 project: str, 109 dataset: str, 110 glob: str = None, 111 pattern: str = None, 112 filetype: str = None, 113 **kwargs 114 ): 115 """ 116 Read the contents of files from a dataset. 117 118 The project and dataset can each be identified by name or ID. 119 Exactly one of ``glob`` or ``pattern`` must be provided. 120 121 **glob** — standard wildcard matching; yields the file content for each 122 matching file: 123 124 - ``*`` matches any characters within a single path segment 125 - ``**`` matches zero or more path segments 126 - Matching is suffix-anchored (``*.csv`` matches at any depth) 127 128 **pattern** — like ``glob`` but ``{name}`` placeholders capture portions 129 of the path automatically; yields ``(content, meta)`` pairs where 130 *meta* is a ``dict`` of extracted values: 131 132 - ``{name}`` captures one path segment (no ``/``) 133 - ``*`` and ``**`` wildcards work as in ``glob`` 134 135 Args: 136 project (str): ID or name of the project. 137 dataset (str): ID or name of the dataset. 138 glob (str): Wildcard expression to match files 139 (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``). 140 Yields one item per matching file: the parsed content. 141 pattern (str): Wildcard expression with ``{name}`` capture 142 placeholders (e.g., ``'{sample}.csv'``, 143 ``'{condition}/{sample}.csv'``). 144 Yields ``(content, meta)`` per matching file. 145 filetype (str): File format used to parse each file. 
Supported values: 146 147 - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame`` 148 - ``'h5ad'``: parse as AnnData (requires ``anndata`` package) 149 - ``'json'``: parse with :func:`json.loads`, returns a Python object 150 - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame`` 151 (requires ``pyarrow`` or ``fastparquet``) 152 - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame`` 153 (requires ``pyarrow``) 154 - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object 155 - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame`` 156 (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``) 157 - ``'text'``: read as plain text, returns a ``str`` 158 - ``'bytes'``: read as raw bytes, returns ``bytes`` 159 - ``None`` (default): infer from file extension 160 (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``, 161 ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``, 162 ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``, 163 ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``) 164 **kwargs: Additional keyword arguments forwarded to the file-parsing 165 function (e.g., ``sep='\\t'`` for CSV/TSV files). 166 167 Yields: 168 - When using ``glob``: *content* for each matching file 169 - When using ``pattern``: ``(content, meta)`` for each matching file, 170 where *meta* is a ``dict`` of values extracted from ``{name}`` 171 placeholders 172 173 Raises: 174 DataPortalInputError: if both ``glob`` and ``pattern`` are provided, 175 or if neither is provided. 
176 177 Example: 178 ```python 179 # Read all CSV files — just the content 180 for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'): 181 print(df.shape) 182 183 # Extract sample names from filenames automatically 184 for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'): 185 print(meta['sample'], df.shape) 186 187 # Multi-level capture: condition directory + sample filename 188 for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'): 189 print(meta['condition'], meta['sample'], df.shape) 190 191 # Read gzip-compressed TSV files with explicit separator 192 for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'): 193 print(df.shape) 194 ``` 195 """ 196 ds = self.get_dataset(project=project, dataset=dataset) 197 yield from ds.read_files(glob=glob, pattern=pattern, filetype=filetype, **kwargs)
Read the contents of files from a dataset.
The project and dataset can each be identified by name or ID.
Exactly one of glob or pattern must be provided.
glob — standard wildcard matching; yields the file content for each matching file:
- `*` matches any characters within a single path segment
- `**` matches zero or more path segments
- Matching is suffix-anchored (`*.csv` matches at any depth)
pattern — like glob but {name} placeholders capture portions
of the path automatically; yields (content, meta) pairs where
meta is a dict of extracted values:
- `{name}` captures one path segment (no `/`)
- `*` and `**` wildcards work as in `glob`
Arguments:
- project (str): ID or name of the project.
- dataset (str): ID or name of the dataset.
- glob (str): Wildcard expression to match files
(e.g., `'*.csv'`, `'data/**/*.tsv.gz'`). Yields one item per matching file: the parsed content.
- pattern (str): Wildcard expression with `{name}` capture placeholders
(e.g., `'{sample}.csv'`, `'{condition}/{sample}.csv'`). Yields `(content, meta)` per matching file.
- filetype (str): File format used to parse each file. Supported values:
  - `'csv'`: parse with `pandas.read_csv()`, returns a `DataFrame`
  - `'h5ad'`: parse as AnnData (requires `anndata` package)
  - `'json'`: parse with `json.loads()`, returns a Python object
  - `'parquet'`: parse with `pandas.read_parquet()`, returns a `DataFrame` (requires `pyarrow` or `fastparquet`)
  - `'feather'`: parse with `pandas.read_feather()`, returns a `DataFrame` (requires `pyarrow`)
  - `'pickle'`: deserialize with `pickle`, returns a Python object
  - `'excel'`: parse with `pandas.read_excel()`, returns a `DataFrame` (requires `openpyxl` for `.xlsx` or `xlrd` for `.xls`)
  - `'text'`: read as plain text, returns a `str`
  - `'bytes'`: read as raw bytes, returns `bytes`
  - `None` (default): infer from file extension (`.csv`/`.tsv` → `'csv'`, `.h5ad` → `'h5ad'`, `.json` → `'json'`, `.parquet` → `'parquet'`, `.feather` → `'feather'`, `.pkl`/`.pickle` → `'pickle'`, `.xlsx`/`.xls` → `'excel'`, otherwise `'text'`)
- **kwargs: Additional keyword arguments forwarded to the file-parsing
function (e.g., `sep='\t'` for CSV/TSV files).
Yields:
- When using `glob`: *content* for each matching file
- When using `pattern`: `(content, meta)` for each matching file, where *meta* is a `dict` of values extracted from `{name}` placeholders
Raises:
- DataPortalInputError: if both `glob` and `pattern` are provided, or if neither is provided.
Example:
```python
# Read all CSV files — just the content
for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
    print(df.shape)

# Extract sample names from filenames automatically
for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
    print(meta['sample'], df.shape)

# Multi-level capture: condition directory + sample filename
for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
    print(meta['condition'], meta['sample'], df.shape)

# Read gzip-compressed TSV files with explicit separator
for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\t'):
    print(df.shape)
```
199 def read_file( 200 self, 201 project: str, 202 dataset: str, 203 path: str = None, 204 glob: str = None, 205 filetype: str = None, 206 **kwargs 207 ): 208 """ 209 Read the contents of a single file from a dataset. 210 211 The project and dataset can each be identified by name or ID. 212 Provide either ``path`` (exact relative path) or ``glob`` (wildcard 213 expression). If ``glob`` is used it must match exactly one file. 214 215 Args: 216 project (str): ID or name of the project. 217 dataset (str): ID or name of the dataset. 218 path (str): Exact relative path of the file within the dataset. 219 glob (str): Wildcard expression matching exactly one file. 220 filetype (str): File format used to parse the file. Supported values 221 are the same as :meth:`read_files`. 222 **kwargs: Additional keyword arguments forwarded to the 223 file-parsing function. 224 225 Returns: 226 Parsed file content. 227 228 Raises: 229 DataPortalInputError: if both or neither of ``path``/``glob`` are 230 provided, or if ``glob`` matches zero or more than one file. 231 """ 232 ds = self.get_dataset(project=project, dataset=dataset) 233 return ds.read_file(path=path, glob=glob, filetype=filetype, **kwargs)
Read the contents of a single file from a dataset.
The project and dataset can each be identified by name or ID.
Provide either path (exact relative path) or glob (wildcard
expression). If glob is used it must match exactly one file.
Arguments:
- project (str): ID or name of the project.
- dataset (str): ID or name of the dataset.
- path (str): Exact relative path of the file within the dataset.
- glob (str): Wildcard expression matching exactly one file.
- filetype (str): File format used to parse the file. Supported values
are the same as `read_files()`.
- **kwargs: Additional keyword arguments forwarded to the file-parsing function.
Returns:
Parsed file content.
Raises:
- DataPortalInputError: if both or neither of `path`/`glob` are provided, or if `glob` matches zero or more than one file.
235 def list_processes(self, ingest=False) -> DataPortalProcesses: 236 """ 237 List all the processes available in the Data Portal. 238 By default, only list non-ingest processes (those which can be run on existing datasets). 239 To list the processes which can be used to upload datasets, use `ingest = True`. 240 241 Args: 242 ingest (bool): If True, only list those processes which can be used to ingest datasets directly 243 """ 244 245 return DataPortalProcesses( 246 [ 247 DataPortalProcess(p, self._client) 248 for p in self._client.processes.list() 249 if not ingest or p.executor == Executor.INGEST 250 ] 251 )
List all the processes available in the Data Portal.
By default, only list non-ingest processes (those which can be run on existing datasets).
To list the processes which can be used to upload datasets, use ingest = True.
Arguments:
- ingest (bool): If True, only list those processes which can be used to ingest datasets directly
253 def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess: 254 """ 255 Return the process with the specified name. 256 257 Args: 258 name (str): Name of process 259 """ 260 261 return self.list_processes(ingest=ingest).get_by_name(name)
Return the process with the specified name.
Arguments:
- name (str): Name of process
263 def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess: 264 """ 265 Return the process with the specified id 266 267 Args: 268 id (str): ID of process 269 """ 270 271 return self.list_processes(ingest=ingest).get_by_id(id)
Return the process with the specified id
Arguments:
- id (str): ID of process
273 def list_reference_types(self) -> DataPortalReferenceTypes: 274 """ 275 Return the list of all available reference types 276 """ 277 278 return DataPortalReferenceTypes( 279 [ 280 DataPortalReferenceType(ref) 281 for ref in self._client.references.get_types() 282 ] 283 )
Return the list of all available reference types
class DataPortalLogin:
    """
    Start the login process, obtaining the authorization message from Cirro
    needed to confirm the user identity.

    Useful when you need to authenticate a user in a non-blocking way.

    Usage:

    ```python
    # Replace app.cirro.bio as appropriate
    login = DataPortalLogin(base_url="app.cirro.bio")

    # Present the user with the authorization message
    print(login.auth_message)

    # Generate the authenticated DataPortal object,
    # blocking until the user completes the login process in their browser
    portal = login.await_completion()
    ```
    """
    base_url: str
    auth_info: DeviceCodeAuth

    def __init__(self, base_url: str = None, enable_cache=False):
        config = AppConfig(base_url=base_url)
        self.base_url = base_url
        # Start the device-code flow immediately, but do not block here;
        # the caller decides when to wait via await_completion()
        self.auth_info = DeviceCodeAuth(
            region=config.region,
            client_id=config.client_id,
            auth_endpoint=config.auth_endpoint,
            enable_cache=enable_cache,
            await_completion=False
        )

    @property
    def auth_message(self) -> str:
        """Authorization message provided by Cirro."""
        return self.auth_info.auth_message

    @property
    def auth_message_markdown(self) -> str:
        """Authorization message provided by Cirro (Markdown format)."""
        return self.auth_info.auth_message_markdown

    def await_completion(self) -> DataPortal:
        """Complete the login process and return an authenticated client"""
        # Block until the user finishes the login flow in their browser
        self.auth_info.await_completion()

        # Wrap the now-authenticated credentials in an API client,
        # then hand back a ready-to-use DataPortal
        authenticated_client = CirroApi(
            auth_info=self.auth_info,
            base_url=self.base_url
        )
        return DataPortal(client=authenticated_client)
Start the login process, obtaining the authorization message from Cirro needed to confirm the user identity.
Useful when you need to authenticate a user in a non-blocking way.
Usage:
# Replace app.cirro.bio as appropriate
login = DataPortalLogin(base_url="app.cirro.bio")
# Present the user with the authorization message
print(login.auth_message)
# Generate the authenticated DataPortal object,
# blocking until the user completes the login process in their browser
portal = login.await_completion()
32 def __init__(self, base_url: str = None, enable_cache=False): 33 app_config = AppConfig(base_url=base_url) 34 35 self.base_url = base_url 36 37 self.auth_info = DeviceCodeAuth( 38 region=app_config.region, 39 client_id=app_config.client_id, 40 auth_endpoint=app_config.auth_endpoint, 41 enable_cache=enable_cache, 42 await_completion=False 43 )
45 @property 46 def auth_message(self) -> str: 47 """Authorization message provided by Cirro.""" 48 return self.auth_info.auth_message
Authorization message provided by Cirro.
50 @property 51 def auth_message_markdown(self) -> str: 52 """Authorization message provided by Cirro (Markdown format).""" 53 return self.auth_info.auth_message_markdown
Authorization message provided by Cirro (Markdown format).
55 def await_completion(self) -> DataPortal: 56 """Complete the login process and return an authenticated client""" 57 58 # Block until the user completes the login flow 59 self.auth_info.await_completion() 60 61 # Set up the client object 62 cirro_client = CirroApi( 63 auth_info=self.auth_info, 64 base_url=self.base_url 65 ) 66 67 # Return the Data Portal object 68 return DataPortal(client=cirro_client)
Complete the login process and return an authenticated client
class DataPortalProject(DataPortalAsset):
    """
    Projects in the Data Portal contain collections of Datasets.
    Users are granted permissions at the project-level, allowing them
    to view and/or modify all the datasets in that collection.
    """
    def __init__(self, proj: Project, client: CirroApi):
        """
        Instantiate with helper method

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        project = portal.get_project_by_name("Project Name")
        ```

        """
        self._data = proj
        self._client = client

    @property
    def id(self) -> str:
        """
        Unique identifier
        """
        return self._data.id

    @property
    def name(self) -> str:
        """
        Readable name
        """
        return self._data.name

    @property
    def description(self) -> str:
        """
        Longer description of the project
        """
        return self._data.description

    @property
    def status(self) -> Status:
        """
        Status of the project
        """
        return self._data.status

    def __str__(self):
        """Control how the Project is rendered as a string."""

        return '\n'.join([
            f"{i.title()}: {self.__getattribute__(i)}"
            for i in ['name', 'id', 'description']
        ])

    @cache
    def _get_datasets(self) -> List[Dataset]:
        # Cached dataset listing; invalidated via the force_refresh
        # flags on the public methods below
        return list_all_datasets(project_id=self.id,
                                 client=self._client)

    def list_datasets(self, force_refresh=False) -> DataPortalDatasets:
        """List all the datasets available in the project."""
        if force_refresh:
            self._get_datasets.cache_clear()

        return DataPortalDatasets(
            [
                DataPortalDataset(d, self._client)
                for d in self._get_datasets()
            ]
        )

    def get_dataset(self, name_or_id: str, force_refresh=False) -> DataPortalDataset:
        """Return the dataset matching the given ID or name.

        Tries to match by ID first, then by name.
        Raises an error if the name matches multiple datasets.
        """
        if force_refresh:
            self._get_datasets.cache_clear()

        # Try by ID first; intentionally broad catch so that any failure
        # (not found, malformed ID, etc.) falls through to name matching
        try:
            return self.get_dataset_by_id(name_or_id)
        except Exception:
            pass

        # Fall back to name matching
        matches = [d for d in self._get_datasets() if d.name == name_or_id]
        if len(matches) == 0:
            raise DataPortalAssetNotFound(f'Dataset with name or ID "{name_or_id}" not found')
        if len(matches) > 1:
            raise DataPortalInputError(
                f'Multiple datasets found with the name "{name_or_id}" — use get_dataset_by_id instead'
            )
        return self.get_dataset_by_id(matches[0].id)

    def get_dataset_by_name(self, name: str, force_refresh=False) -> DataPortalDataset:
        """Return the dataset with the specified name."""
        if force_refresh:
            self._get_datasets.cache_clear()

        dataset = next((d for d in self._get_datasets() if d.name == name), None)
        if dataset is None:
            raise DataPortalAssetNotFound(f'Dataset with name {name} not found')
        # Re-fetch by ID so the returned record is fully populated
        return self.get_dataset_by_id(dataset.id)

    def get_dataset_by_id(self, _id: str = None) -> DataPortalDataset:
        """Return the dataset with the specified id."""

        dataset = self._client.datasets.get(project_id=self.id, dataset_id=_id)
        if dataset is None:
            raise DataPortalAssetNotFound(f'Dataset with ID {_id} not found')
        return DataPortalDataset(dataset, self._client)

    def list_references(self, reference_type: str = None) -> DataPortalReferences:
        """
        List the references available in a project.
        Optionally filter to references of a particular type (identified by name)
        """

        # Get the complete list of references which are available
        reference_types = DataPortalReferenceTypes(
            [
                DataPortalReferenceType(ref)
                for ref in self._client.references.get_types()
            ]
        )

        # If a particular name was specified
        if reference_type is not None:
            reference_types = reference_types.filter_by_pattern(reference_type)
            if len(reference_types) == 0:
                msg = f"Could not find any reference types with the name {reference_type}"
                raise DataPortalAssetNotFound(msg)

        return DataPortalReferences(
            [
                DataPortalReference(ref, project_id=self.id, client=self._client)
                for ref in self._client.references.get_for_project(
                    self.id
                )
                if reference_type is None or ref.type_ == reference_type
            ]
        )

    def get_reference_by_name(self, name: str = None, ref_type: str = None) -> DataPortalReference:
        """Return the reference of a particular type with the specified name."""

        if name is None:
            raise DataPortalInputError("Must specify the reference name")

        return self.list_references(ref_type).get_by_name(name)

    def upload_dataset(
        self,
        name: str = None,
        description='',
        process: Union[DataPortalProcess, str] = None,
        upload_folder: str = None,
        files: List[str] = None,
        tags: List[str] = None,
    ):
        """
        Upload a set of files to the Data Portal, creating a new dataset.

        If the files parameter is not provided, it will upload all files in the upload folder

        Args:
            name (str): Name of newly created dataset
            description (str): Description of newly created dataset
            process (str | DataPortalProcess): Process to run may be referenced by name, ID, or object
            upload_folder (str): Folder containing files to upload
            files (List[str]): Optional subset of files to upload from the folder
            tags (List[str]): Optional list of tags to apply to the dataset
        """

        if name is None:
            raise DataPortalInputError("Must provide name for new dataset")
        if process is None:
            raise DataPortalInputError("Must provide the process which is used for ingest")
        if upload_folder is None:
            raise DataPortalInputError("Must provide upload_folder -- folder containing files to upload")

        # Parse the process provided by the user
        process = parse_process_name_or_id(process, self._client)

        # If no files were provided
        if files is None:
            # Get the list of files in the upload folder
            files = get_files_in_directory(upload_folder)

        # NOTE(review): raising a Warning subclass is unusual, but callers
        # may depend on catching RuntimeWarning here — keep as-is
        if files is None or len(files) == 0:
            raise RuntimeWarning("No files to upload, exiting")

        # Normalize into Tag object
        if tags is not None:
            tags = [Tag(value=value) for value in tags]

        # Make sure that the files match the expected pattern
        self._client.processes.check_dataset_files(files, process.id, upload_folder)

        # Create the ingest process request
        dataset_create_request = UploadDatasetRequest(
            process_id=process.id,
            name=name,
            description=description,
            expected_files=files,
            tags=tags,
        )

        # Get the response
        create_response = self._client.datasets.create(project_id=self.id,
                                                       upload_request=dataset_create_request)

        # Upload the files
        self._client.datasets.upload_files(
            project_id=self.id,
            dataset_id=create_response.id,
            directory=upload_folder,
            files=files
        )

        # Return the dataset which was created, which might take a second to update
        max_attempts = 5
        for attempt in range(max_attempts):
            try:
                return self.get_dataset_by_id(create_response.id)
            except DataPortalAssetNotFound:
                if attempt == max_attempts - 1:
                    # Exhausted retries: re-raise with the original traceback
                    raise
                else:
                    sleep(2)

    def samples(self, max_items: int = 10000) -> List[Sample]:
        """
        Retrieves a list of samples associated with a project along with their metadata

        Args:
            max_items (int): Maximum number of records to get (default 10,000)
        """
        return self._client.metadata.get_project_samples(self.id, max_items)
Projects in the Data Portal contain collections of Datasets. Users are granted permissions at the project-level, allowing them to view and/or modify all the datasets in that collection.
26 def __init__(self, proj: Project, client: CirroApi): 27 """ 28 Instantiate with helper method 29 30 ```python 31 from cirro import DataPortal() 32 portal = DataPortal() 33 project = portal.get_project_by_name("Project Name") 34 ``` 35 36 """ 37 self._data = proj 38 self._client = client
Instantiate with helper method
from cirro import DataPortal
portal = DataPortal()
project = portal.get_project_by_name("Project Name")
@property
def description(self) -> str:
    """Longer, free-text description of the project."""
    data = self._data
    return data.description
Longer description of the project
@property
def status(self) -> Status:
    """Current status of the project."""
    data = self._data
    return data.status
Status of the project
def list_datasets(self, force_refresh=False) -> DataPortalDatasets:
    """List all the datasets available in the project."""
    if force_refresh:
        # Drop the cached listing so the next call hits the API again
        self._get_datasets.cache_clear()

    wrapped = [DataPortalDataset(record, self._client) for record in self._get_datasets()]
    return DataPortalDatasets(wrapped)
List all the datasets available in the project.
def get_dataset(self, name_or_id: str, force_refresh=False) -> DataPortalDataset:
    """Return the dataset matching the given ID or name.

    Tries to match by ID first, then by name.
    Raises an error if the name matches multiple datasets.
    """
    if force_refresh:
        self._get_datasets.cache_clear()

    # An exact ID match wins outright; any failure falls through to
    # name-based resolution (the argument may simply not be an ID).
    try:
        return self.get_dataset_by_id(name_or_id)
    except Exception:
        pass

    named = [record for record in self._get_datasets() if record.name == name_or_id]
    if not named:
        raise DataPortalAssetNotFound(f'Dataset with name or ID "{name_or_id}" not found')
    if len(named) > 1:
        raise DataPortalInputError(
            f'Multiple datasets found with the name "{name_or_id}" — use get_dataset_by_id instead'
        )
    # Re-fetch by ID so the caller gets the full dataset record
    return self.get_dataset_by_id(named[0].id)
Return the dataset matching the given ID or name.
Tries to match by ID first, then by name. Raises an error if the name matches multiple datasets.
def get_dataset_by_name(self, name: str, force_refresh=False) -> DataPortalDataset:
    """Return the dataset with the specified name."""
    if force_refresh:
        self._get_datasets.cache_clear()

    match = next((record for record in self._get_datasets() if record.name == name), None)
    if match is None:
        raise DataPortalAssetNotFound(f'Dataset with name {name} not found')
    # Re-fetch by ID so the caller gets the full dataset record
    return self.get_dataset_by_id(match.id)
Return the dataset with the specified name.
def get_dataset_by_id(self, _id: str = None) -> DataPortalDataset:
    """Return the dataset with the specified id."""
    record = self._client.datasets.get(project_id=self.id, dataset_id=_id)
    if record is None:
        raise DataPortalAssetNotFound(f'Dataset with ID {_id} not found')
    return DataPortalDataset(record, self._client)
Return the dataset with the specified id.
def list_references(self, reference_type: str = None) -> DataPortalReferences:
    """
    List the references available in a project.
    Optionally filter to references of a particular type (identified by name)
    """
    # Every reference type known to the system
    available_types = DataPortalReferenceTypes(
        [DataPortalReferenceType(ref) for ref in self._client.references.get_types()]
    )

    # Narrow to the requested type name, if one was given
    if reference_type is not None:
        available_types = available_types.filter_by_pattern(reference_type)
        if len(available_types) == 0:
            msg = f"Could not find any reference types with the name {reference_type}"
            raise DataPortalAssetNotFound(msg)

    return DataPortalReferences(
        [
            DataPortalReference(ref, project_id=self.id, client=self._client)
            for ref in self._client.references.get_for_project(self.id)
            if reference_type is None or ref.type_ == reference_type
        ]
    )
List the references available in a project. Optionally filter to references of a particular type (identified by name)
def get_reference_by_name(self, name: str = None, ref_type: str = None) -> DataPortalReference:
    """Return the reference of a particular type with the specified name."""
    if name is None:
        raise DataPortalInputError("Must specify the reference name")
    references = self.list_references(ref_type)
    return references.get_by_name(name)
Return the reference of a particular type with the specified name.
def upload_dataset(
        self,
        name: str = None,
        description='',
        process: Union[DataPortalProcess, str] = None,
        upload_folder: str = None,
        files: List[str] = None,
        tags: List[str] = None,
):
    """
    Upload a set of files to the Data Portal, creating a new dataset.

    If the files parameter is not provided, it will upload all files in the upload folder.

    Args:
        name (str): Name of newly created dataset
        description (str): Description of newly created dataset
        process (str | DataPortalProcess): Process to run; may be referenced by name, ID, or object
        upload_folder (str): Folder containing files to upload
        files (List[str]): Optional subset of files to upload from the folder
        tags (List[str]): Optional list of tags to apply to the dataset
    """
    # Required-argument guards
    if name is None:
        raise DataPortalInputError("Must provide name for new dataset")
    if process is None:
        raise DataPortalInputError("Must provide the process which is used for ingest")
    if upload_folder is None:
        raise DataPortalInputError("Must provide upload_folder -- folder containing files to upload")

    # Resolve a process name/ID string into a process object
    ingest_process = parse_process_name_or_id(process, self._client)

    # Default to every file found in the upload folder
    if files is None:
        files = get_files_in_directory(upload_folder)
    if files is None or len(files) == 0:
        raise RuntimeWarning("No files to upload, exiting")

    # Normalize plain strings into Tag objects
    dataset_tags = None if tags is None else [Tag(value=value) for value in tags]

    # Confirm the files satisfy the pattern expected by the process
    self._client.processes.check_dataset_files(files, ingest_process.id, upload_folder)

    request = UploadDatasetRequest(
        process_id=ingest_process.id,
        name=name,
        description=description,
        expected_files=files,
        tags=dataset_tags,
    )

    created = self._client.datasets.create(
        project_id=self.id,
        upload_request=request,
    )

    self._client.datasets.upload_files(
        project_id=self.id,
        dataset_id=created.id,
        directory=upload_folder,
        files=files,
    )

    # The newly created dataset may take a moment to become visible;
    # poll a few times before giving up.
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            return self.get_dataset_by_id(created.id)
        except DataPortalAssetNotFound:
            if attempt == max_attempts:
                raise
            sleep(2)
Upload a set of files to the Data Portal, creating a new dataset.
If the files parameter is not provided, it will upload all files in the upload folder
Arguments:
- name (str): Name of newly created dataset
- description (str): Description of newly created dataset
- process (str | DataPortalProcess): Process to run; may be referenced by name, ID, or object
- upload_folder (str): Folder containing files to upload
- files (List[str]): Optional subset of files to upload from the folder
- tags (List[str]): Optional list of tags to apply to the dataset
def samples(self, max_items: int = 10000) -> List[Sample]:
    """
    Retrieves a list of samples associated with a project along with their metadata

    Args:
        max_items (int): Maximum number of records to get (default 10,000)
    """
    metadata_api = self._client.metadata
    return metadata_api.get_project_samples(self.id, max_items)
Retrieves a list of samples associated with a project along with their metadata
Arguments:
- max_items (int): Maximum number of records to get (default 10,000)
class DataPortalProcess(DataPortalAsset):
    """Helper functions for interacting with analysis processes."""

    def __init__(self, process: Union[Process, ProcessDetail], client: CirroApi):
        """
        Instantiate with helper method

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        process = portal.get_process_by_name("Process Name")
        ```
        """
        self._data = process
        self._client = client

    @property
    def id(self) -> str:
        """Unique identifier"""
        return self._data.id

    @property
    def name(self) -> str:
        """Readable name"""
        return self._data.name

    @property
    def description(self) -> str:
        """Longer description of process"""
        return self._data.description

    @property
    def child_process_ids(self) -> List[str]:
        """List of processes which can be run on the output of this process"""
        return self._data.child_process_ids

    @property
    def executor(self) -> Executor:
        """INGEST, CROMWELL, or NEXTFLOW"""
        return self._data.executor

    @property
    def category(self) -> str:
        """Category of process"""
        return self._data.category

    @property
    def pipeline_type(self) -> str:
        """Pipeline type"""
        return self._data.pipeline_type

    @property
    def documentation_url(self) -> str:
        """Documentation URL"""
        return self._data.documentation_url

    @property
    def file_requirements_message(self) -> str:
        """Description of files required for INGEST processes"""
        return self._data.file_requirements_message

    @property
    def code(self) -> PipelineCode:
        """Pipeline code configuration"""
        # Requires the full ProcessDetail record (fetched lazily)
        return self._get_detail().pipeline_code

    @property
    def custom_settings(self) -> CustomPipelineSettings:
        """Custom settings for the process"""
        return self._get_detail().custom_settings

    def _get_detail(self) -> ProcessDetail:
        # Upgrade the cached record to a full ProcessDetail on first use,
        # so repeated detail lookups do not repeat the API call.
        if not isinstance(self._data, ProcessDetail):
            self._data = self._client.processes.get(self.id)
        return self._data

    def __str__(self):
        # Human-readable summary of the key identifying fields
        return '\n'.join([
            f"{i.title()}: {self.__getattribute__(i)}"
            for i in ['name', 'id', 'description']
        ])

    def get_parameter_spec(self) -> ParameterSpecification:
        """
        Gets a specification used to describe the parameters used in the process.
        """
        return self._client.processes.get_parameter_spec(self.id)

    def run_analysis(
            self,
            name: str = None,
            project_id: str = None,
            datasets: list = None,
            description: str = "",
            params=None,
            notifications_emails: List[str] = None,
            compute_environment: str = None,
            resume_dataset_id: str = None,
            source_sample_ids: List[str] = None
    ) -> str:
        """
        Runs this process on one or more input datasets, returns the ID of the newly created dataset.

        Args:
            name (str): Name of newly created dataset
            project_id (str): ID of the project to run the analysis in
            datasets (List[DataPortalDataset or str]): One or more input datasets
                (as DataPortalDataset objects or dataset ID strings)
            description (str): Description of newly created dataset
            params (dict): Analysis parameters
            notifications_emails (List[str]): Notification email address(es)
            compute_environment (str): Name or ID of compute environment to use,
                if blank it will run in AWS
            resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
                It will attempt to re-use the previous output to minimize duplicate work.
                Note that Nextflow does not require this parameter, as it will automatically resume
                from any previous attempts using a global cache.
            source_sample_ids (List[str]): List of sample IDs to use as input for the analysis.

        Returns:
            dataset_id (str): ID of newly created dataset
        """
        if name is None:
            raise DataPortalInputError("Must specify 'name' for run_analysis")
        if project_id is None:
            raise DataPortalInputError("Must specify 'project_id' for run_analysis")
        if not datasets:
            raise DataPortalInputError("Must specify 'datasets' for run_analysis")
        if notifications_emails is None:
            notifications_emails = []
        if params is None:
            params = {}

        # Accept DataPortalDataset objects or raw ID strings
        source_dataset_ids = [
            ds if isinstance(ds, str) else ds.id
            for ds in datasets
        ]

        if compute_environment:
            # Keep the original string for the error message below
            compute_environment_name = compute_environment
            compute_environments = self._client.compute_environments.list_environments_for_project(
                project_id=project_id
            )
            # NOTE: the generator still sees the original string value here,
            # because next() consumes it before `compute_environment` is rebound.
            compute_environment = next(
                (env for env in compute_environments
                 if env.name == compute_environment or env.id == compute_environment),
                None
            )
            if compute_environment is None:
                raise DataPortalInputError(f"Compute environment '{compute_environment_name}' not found")

        resp = self._client.execution.run_analysis(
            project_id=project_id,
            request=RunAnalysisRequest(
                name=name,
                description=description,
                process_id=self.id,
                source_dataset_ids=source_dataset_ids,
                params=RunAnalysisRequestParams.from_dict(params),
                notification_emails=notifications_emails,
                resume_dataset_id=resume_dataset_id,
                source_sample_ids=source_sample_ids,
                compute_environment_id=compute_environment.id if compute_environment else None
            )
        )
        return resp.id
Helper functions for interacting with analysis processes.
def __init__(self, process: Union[Process, ProcessDetail], client: CirroApi):
    """
    Instantiate with a helper method, for example:

    ```python
    from cirro import DataPortal
    portal = DataPortal()
    process = portal.get_process_by_name("Process Name")
    ```
    """
    # Keep a handle on the API client and the underlying process record
    self._client = client
    self._data = process
Instantiate with helper method
from cirro import DataPortal
portal = DataPortal()
process = portal.get_process_by_name("Process Name")
@property
def description(self) -> str:
    """Longer, free-text description of the process."""
    data = self._data
    return data.description
Longer description of process
@property
def child_process_ids(self) -> List[str]:
    """IDs of the processes which can be run on the output of this process."""
    data = self._data
    return data.child_process_ids
List of processes which can be run on the output of this process
@property
def executor(self) -> Executor:
    """Execution backend: INGEST, CROMWELL, or NEXTFLOW."""
    data = self._data
    return data.executor
INGEST, CROMWELL, or NEXTFLOW
@property
def category(self) -> str:
    """Category the process belongs to."""
    data = self._data
    return data.category
Category of process
@property
def pipeline_type(self) -> str:
    """Type of the pipeline."""
    data = self._data
    return data.pipeline_type
Pipeline type
@property
def documentation_url(self) -> str:
    """URL of the process documentation."""
    data = self._data
    return data.documentation_url
Documentation URL
@property
def file_requirements_message(self) -> str:
    """Description of the files required for INGEST processes."""
    data = self._data
    return data.file_requirements_message
Description of files required for INGEST processes
@property
def code(self) -> PipelineCode:
    """Pipeline code configuration (requires the full process detail)."""
    detail = self._get_detail()
    return detail.pipeline_code
Pipeline code configuration
@property
def custom_settings(self) -> CustomPipelineSettings:
    """Custom settings for the process (requires the full process detail)."""
    detail = self._get_detail()
    return detail.custom_settings
Custom settings for the process
def get_parameter_spec(self) -> ParameterSpecification:
    """
    Gets a specification used to describe the parameters used in the process.
    """
    processes_api = self._client.processes
    return processes_api.get_parameter_spec(self.id)
Gets a specification used to describe the parameters used in the process.
def run_analysis(
        self,
        name: str = None,
        project_id: str = None,
        datasets: list = None,
        description: str = "",
        params=None,
        notifications_emails: List[str] = None,
        compute_environment: str = None,
        resume_dataset_id: str = None,
        source_sample_ids: List[str] = None
) -> str:
    """
    Runs this process on one or more input datasets, returns the ID of the newly created dataset.

    Args:
        name (str): Name of newly created dataset
        project_id (str): ID of the project to run the analysis in
        datasets (List[DataPortalDataset or str]): One or more input datasets
            (as DataPortalDataset objects or dataset ID strings)
        description (str): Description of newly created dataset
        params (dict): Analysis parameters
        notifications_emails (List[str]): Notification email address(es)
        compute_environment (str): Name or ID of compute environment to use,
            if blank it will run in AWS
        resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
            It will attempt to re-use the previous output to minimize duplicate work.
            Note that Nextflow does not require this parameter, as it will automatically resume
            from any previous attempts using a global cache.
        source_sample_ids (List[str]): List of sample IDs to use as input for the analysis.

    Returns:
        dataset_id (str): ID of newly created dataset
    """
    if name is None:
        raise DataPortalInputError("Must specify 'name' for run_analysis")
    if project_id is None:
        raise DataPortalInputError("Must specify 'project_id' for run_analysis")
    if not datasets:
        raise DataPortalInputError("Must specify 'datasets' for run_analysis")
    if notifications_emails is None:
        notifications_emails = []
    if params is None:
        params = {}

    # Accept DataPortalDataset objects or raw ID strings
    source_dataset_ids = [
        ds if isinstance(ds, str) else ds.id
        for ds in datasets
    ]

    if compute_environment:
        # Keep the original string for the error message below
        compute_environment_name = compute_environment
        compute_environments = self._client.compute_environments.list_environments_for_project(
            project_id=project_id
        )
        # NOTE: the generator still sees the original string value here,
        # because next() consumes it before `compute_environment` is rebound.
        compute_environment = next(
            (env for env in compute_environments
             if env.name == compute_environment or env.id == compute_environment),
            None
        )
        if compute_environment is None:
            raise DataPortalInputError(f"Compute environment '{compute_environment_name}' not found")

    resp = self._client.execution.run_analysis(
        project_id=project_id,
        request=RunAnalysisRequest(
            name=name,
            description=description,
            process_id=self.id,
            source_dataset_ids=source_dataset_ids,
            params=RunAnalysisRequestParams.from_dict(params),
            notification_emails=notifications_emails,
            resume_dataset_id=resume_dataset_id,
            source_sample_ids=source_sample_ids,
            compute_environment_id=compute_environment.id if compute_environment else None
        )
    )
    return resp.id
Runs this process on one or more input datasets, returns the ID of the newly created dataset.
Arguments:
- name (str): Name of newly created dataset
- project_id (str): ID of the project to run the analysis in
- datasets (List[DataPortalDataset or str]): One or more input datasets (as DataPortalDataset objects or dataset ID strings)
- description (str): Description of newly created dataset
- params (dict): Analysis parameters
- notifications_emails (List[str]): Notification email address(es)
- compute_environment (str): Name or ID of compute environment to use, if blank it will run in AWS
- resume_dataset_id (str): ID of dataset to resume from, used for caching task execution. It will attempt to re-use the previous output to minimize duplicate work.
  Note that Nextflow does not require this parameter, as it will automatically resume from any previous attempts using a global cache.
- source_sample_ids (List[str]): List of sample IDs to use as input for the analysis.
Returns:
dataset_id (str): ID of newly created dataset
class DataPortalDataset(DataPortalAsset):
    """
    Datasets in the Data Portal are collections of files which have
    either been uploaded directly, or which have been output by
    an analysis pipeline or notebook.
    """

    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
        """
        Instantiate a dataset object

        Should be invoked from a top-level constructor, for example:

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        ```

        """
        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
        self._data = dataset
        # Cached asset listing, populated lazily by _get_assets()
        self._assets: Optional[DatasetAssets] = None
        self._client = client

    @property
    def id(self) -> str:
        """Unique identifier for the dataset"""
        return self._data.id

    @property
    def name(self) -> str:
        """Editable name for the dataset"""
        return self._data.name

    @property
    def description(self) -> str:
        """Longer description of the dataset"""
        return self._data.description

    @property
    def process_id(self) -> str:
        """Unique ID of process used to create the dataset"""
        return self._data.process_id

    @property
    def process(self) -> ProcessDetail:
        """
        Object representing the process used to create the dataset
        """
        return self._client.processes.get(self.process_id)

    @property
    def project_id(self) -> str:
        """ID of the project containing the dataset"""
        return self._data.project_id

    @property
    def status(self) -> Status:
        """
        Status of the dataset
        """
        return self._data.status

    @property
    def source_dataset_ids(self) -> List[str]:
        """IDs of the datasets used as sources for this dataset (if any)"""
        return self._data.source_dataset_ids

    @property
    def source_datasets(self) -> List['DataPortalDataset']:
        """
        Objects representing the datasets used as sources for this dataset (if any)
        """
        return [
            DataPortalDataset(
                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
                client=self._client
            )
            for dataset_id in self.source_dataset_ids
        ]

    @property
    def params(self) -> dict:
        """
        Parameters used to generate the dataset
        """
        return self._get_detail().params.to_dict()

    @property
    def info(self) -> dict:
        """
        Extra information about the dataset
        """
        return self._get_detail().info.to_dict()

    @property
    def tags(self) -> List[Tag]:
        """
        Tags applied to the dataset
        """
        return self._data.tags

    @property
    def share(self) -> Optional[NamedItem]:
        """
        Share associated with the dataset, if any.
        """
        return self._get_detail().share

    @property
    def created_by(self) -> str:
        """User who created the dataset"""
        return self._data.created_by

    @property
    def created_at(self) -> datetime.datetime:
        """Timestamp of dataset creation"""
        return self._data.created_at

    def _get_detail(self):
        # Upgrade the cached record to a full DatasetDetail on first use,
        # so repeated detail lookups do not repeat the API call.
        if not isinstance(self._data, DatasetDetail):
            self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id)
        return self._data

    def _get_assets(self):
        # Lazily fetch and cache the asset listing for the dataset
        if not self._assets:
            self._assets = self._client.datasets.get_assets_listing(
                project_id=self.project_id,
                dataset_id=self.id
            )
        return self._assets

    def __str__(self):
        # Human-readable summary of the key identifying fields
        return '\n'.join([
            f"{i.title()}: {self.__getattribute__(i)}"
            for i in ['name', 'id', 'description', 'status']
        ])

    def get_file(self, relative_path: str) -> DataPortalFile:
        """
        Get a file from the dataset using its relative path.

        Args:
            relative_path (str): Relative path of file within the dataset

        Returns:
            `from cirro.sdk.file import DataPortalFile`
        """

        # Get the list of files in this dataset
        files = self.list_files()

        # Try getting the file using the relative path provided by the user
        try:
            return files.get_by_id(relative_path)
        except DataPortalAssetNotFound:
            # Try getting the file with the 'data/' prefix prepended
            try:
                return files.get_by_id("data/" + relative_path)
            except DataPortalAssetNotFound:
                # If not found, raise the exception using the string provided
                # by the user, not the data/ prepended version (which may be
                # confusing to the user)
                msg = '\n'.join([f"No file found with path '{relative_path}'."])
                raise DataPortalAssetNotFound(msg)

    def list_files(self, file_limit: int = 100000) -> DataPortalFiles:
        """
        Return the list of files which make up the dataset.

        Args:
            file_limit (int): Maximum number of files to return (default 100,000)
        """
        assets = self._client.datasets.get_assets_listing(
            project_id=self.project_id,
            dataset_id=self.id,
            file_limit=file_limit
        )
        files = assets.files

        return DataPortalFiles(
            [
                DataPortalFile(file=file, client=self._client)
                for file in files
            ]
        )

    def read_files(
            self,
            glob: str = None,
            pattern: str = None,
            filetype: str = None,
            **kwargs
    ):
        """
        Read the contents of files in the dataset.

        See :meth:`~cirro.sdk.portal.DataPortal.read_files` for full details
        on ``glob``/``pattern`` matching and filetype options.

        Args:
            glob (str): Wildcard expression to match files.
                Yields one item per matching file: the parsed content.
            pattern (str): Wildcard expression with ``{name}`` capture
                placeholders. Yields ``(content, meta)`` per matching file.
            filetype (str): File format used to parse each file
                (or ``None`` to infer from extension).
            **kwargs: Additional keyword arguments forwarded to the
                file-parsing function.

        Yields:
            - When using ``glob``: *content* for each matching file
            - When using ``pattern``: ``(content, meta)`` for each matching file
        """
        # Exactly one of glob / pattern must be given
        if glob is not None and pattern is not None:
            raise DataPortalInputError("Cannot specify both 'glob' and 'pattern' — use one or the other")
        if glob is None and pattern is None:
            raise DataPortalInputError("Must specify either 'glob' or 'pattern'")

        if glob is not None:
            for file in filter_files_by_pattern(list(self.list_files()), glob):
                yield _read_file_with_format(file, filetype, **kwargs)
        else:
            # Compile the {name}-style pattern once, then match each file path
            compiled_regex, _ = _pattern_to_captures_regex(pattern)
            for file in self.list_files():
                m = compiled_regex.match(file.relative_path)
                if m is not None:
                    yield _read_file_with_format(file, filetype, **kwargs), m.groupdict()

    def read_file(
            self,
            path: str = None,
            glob: str = None,
            filetype: str = None,
            **kwargs
    ) -> Any:
        """
        Read the contents of a single file from the dataset.

        See :meth:`~cirro.sdk.portal.DataPortal.read_file` for full details.

        Args:
            path (str): Exact relative path of the file within the dataset.
            glob (str): Wildcard expression matching exactly one file.
            filetype (str): File format used to parse the file. Supported values
                are the same as :meth:`~cirro.sdk.portal.DataPortal.read_files`.
            **kwargs: Additional keyword arguments forwarded to the file-parsing
                function.

        Returns:
            Parsed file content.
        """
        # Exactly one of path / glob must be given
        if path is not None and glob is not None:
            raise DataPortalInputError("Cannot specify both 'path' and 'glob' — use one or the other")
        if path is None and glob is None:
            raise DataPortalInputError("Must specify either 'path' or 'glob'")

        if path is not None:
            file = self.get_file(path)
        else:
            matches = list(filter_files_by_pattern(list(self.list_files()), glob))
            if len(matches) == 0:
                raise DataPortalAssetNotFound(f"No files matched glob '{glob}'")
            if len(matches) > 1:
                raise DataPortalInputError(
                    f"glob '{glob}' matched {len(matches)} files — use read_files() to read multiple files"
                )
            file = matches[0]

        return _read_file_with_format(file, filetype, **kwargs)

    def get_trace(self) -> Any:
        """
        Read the Nextflow workflow trace file for this dataset as a DataFrame.

        Returns:
            `pandas.DataFrame`
        """
        return self.get_artifact(ArtifactType.WORKFLOW_TRACE).read_csv(sep='\t')

    def get_logs(self) -> str:
        """
        Read the Nextflow workflow logs for this dataset as a string.

        Returns:
            str
        """
        return self.get_artifact(ArtifactType.WORKFLOW_LOGS).read()

    def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
        """
        Get the artifact of a particular type from the dataset
        """
        artifacts = self._get_assets().artifacts
        artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None)
        if artifact is None:
            raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'")
        return DataPortalFile(file=artifact.file, client=self._client)

    def list_artifacts(self) -> List[DataPortalFile]:
        """
        Return the list of artifacts associated with the dataset

        An artifact may be something generated as part of the analysis or other process.
        See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types.

        """
        artifacts = self._get_assets().artifacts
        # NOTE(review): returns a DataPortalFiles collection although the
        # annotation says List[DataPortalFile] — confirm intended type.
        return DataPortalFiles(
            [
                DataPortalFile(file=artifact.file, client=self._client)
                for artifact in artifacts
            ]
        )

    def download_files(self, download_location: str = None, glob: str = None) -> None:
        """
        Download all the files from the dataset to a local directory.

        Args:
            download_location (str): Path to local directory
            glob (str): Optional wildcard expression to filter which files are downloaded
                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
                If omitted, all files are downloaded.
        """

        files = self.list_files()
        if glob is not None:
            files = DataPortalFiles(filter_files_by_pattern(list(files), glob))
        files.download(download_location)

    def run_analysis(
            self,
            name: str = None,
            description: str = "",
            process: Union[DataPortalProcess, str] = None,
            params=None,
            notifications_emails: List[str] = None,
            compute_environment: str = None,
            resume_dataset_id: str = None,
            source_sample_ids: List[str] = None
    ) -> str:
        """
        Runs an analysis on a dataset, returns the ID of the newly created dataset.

        The process can be provided as either a DataPortalProcess object,
        or a string which corresponds to the name or ID of the process.

        Args:
            name (str): Name of newly created dataset
            description (str): Description of newly created dataset
            process (DataPortalProcess or str): Process to run
            params (dict): Analysis parameters
            notifications_emails (List[str]): Notification email address(es)
            compute_environment (str): Name or ID of compute environment to use,
                if blank it will run in AWS
            resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
                It will attempt to re-use the previous output to minimize duplicate work
            source_sample_ids (List[str]): List of sample IDs to use as input for the analysis.

        Returns:
            dataset_id (str): ID of newly created dataset
        """
        if name is None:
            raise DataPortalInputError("Must specify 'name' for run_analysis")
        if process is None:
            raise DataPortalInputError("Must specify 'process' for run_analysis")
        if notifications_emails is None:
            notifications_emails = []
        if params is None:
            params = {}

        # If the process is a string, try to parse it as a process name or ID
        process = parse_process_name_or_id(process, self._client)

        if compute_environment:
            # Keep the original string for the error message below
            compute_environment_name = compute_environment
            compute_environments = self._client.compute_environments.list_environments_for_project(
                project_id=self.project_id
            )
            # NOTE: the generator still sees the original string value here,
            # because next() consumes it before `compute_environment` is rebound.
            compute_environment = next(
                (env for env in compute_environments
                 if env.name == compute_environment or env.id == compute_environment),
                None
            )
            if compute_environment is None:
                raise DataPortalInputError(f"Compute environment '{compute_environment_name}' not found")

        resp = self._client.execution.run_analysis(
            project_id=self.project_id,
            request=RunAnalysisRequest(
                name=name,
                description=description,
                process_id=process.id,
                source_dataset_ids=[self.id],
                params=RunAnalysisRequestParams.from_dict(params),
                notification_emails=notifications_emails,
                resume_dataset_id=resume_dataset_id,
                source_sample_ids=source_sample_ids,
                compute_environment_id=compute_environment.id if compute_environment else None
            )
        )
        return resp.id

    def update_samplesheet(self,
                           contents: str = None,
                           file_path: PathLike = None):
        """
        Updates the samplesheet metadata of a dataset.
        Provide either the contents (as a string) or a file path.
        Both must be in the format of a CSV.

        Args:
            contents (str): Samplesheet contents to update (should be a CSV string)
            file_path (PathLike): Path of file to update (should be a CSV file)

        Example:
        ```python
        dataset.update_samplesheet(
            file_path=Path('~/samplesheet.csv')
        )
        ```
        """

        if contents is None and file_path is None:
            raise DataPortalInputError("Must specify either 'contents' or 'file_path' when updating samplesheet")

        samplesheet_contents = contents
        if file_path is not None:
            samplesheet_contents = Path(file_path).expanduser().read_text()

        # Validate samplesheet
        file_names = [f.file_name for f in self.list_files()]
        request = ValidateFileRequirementsRequest(
            file_names=file_names,
            sample_sheet=samplesheet_contents,
        )
        requirements = validate_file_requirements.sync(process_id=self.process_id,
                                                       body=request,
                                                       client=self._client.api_client)
        if error_msg := requirements.error_msg:
            raise DataPortalInputError(error_msg)

        # Update the samplesheet if everything looks ok
        self._client.datasets.update_samplesheet(
            project_id=self.project_id,
            dataset_id=self.id,
            samplesheet=samplesheet_contents
        )
Datasets in the Data Portal are collections of files which have either been uploaded directly, or which have been output by an analysis pipeline or notebook.
def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
    """
    Instantiate a dataset object

    Should be invoked from a top-level constructor, for example:

    ```python
    from cirro import DataPortal
    portal = DataPortal()
    dataset = portal.get_dataset(
        project="id-or-name-of-project",
        dataset="id-or-name-of-dataset"
    )
    ```
    """
    # Validate explicitly instead of with `assert`, which is silently
    # stripped when Python runs with -O / -OO; AssertionError is raised
    # directly to stay compatible with existing callers.
    if dataset.project_id is None:
        raise AssertionError("Must provide dataset with project_id attribute")
    self._data = dataset
    # Cached asset listing, populated lazily on first use
    self._assets: Optional[DatasetAssets] = None
    self._client = client
Instantiate a dataset object
Should be invoked from a top-level constructor, for example:
from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
project="id-or-name-of-project",
dataset="id-or-name-of-dataset"
)
@property
def id(self) -> str:
    """The dataset's unique identifier."""
    record = self._data
    return record.id
Unique identifier for the dataset
@property
def name(self) -> str:
    """The dataset's editable display name."""
    record = self._data
    return record.name
Editable name for the dataset
@property
def description(self) -> str:
    """Longer, editable description of the dataset."""
    record = self._data
    return record.description
Longer name for the dataset
@property
def process_id(self) -> str:
    """Unique ID of the process that was used to create the dataset."""
    record = self._data
    return record.process_id
Unique ID of process used to create the dataset
@property
def process(self) -> ProcessDetail:
    """Detailed record of the process used to create the dataset."""
    pid = self.process_id
    return self._client.processes.get(pid)
Object representing the process used to create the dataset
@property
def project_id(self) -> str:
    """ID of the project that contains this dataset."""
    record = self._data
    return record.project_id
ID of the project containing the dataset
@property
def status(self) -> Status:
    """Current status of the dataset."""
    record = self._data
    return record.status
Status of the dataset
@property
def source_dataset_ids(self) -> List[str]:
    """IDs of the datasets used as sources for this dataset (if any)."""
    record = self._data
    return record.source_dataset_ids
IDs of the datasets used as sources for this dataset (if any)
@property
def source_datasets(self) -> List['DataPortalDataset']:
    """Dataset objects for each source (input) dataset, if any."""
    sources = []
    for src_id in self.source_dataset_ids:
        # Fetch the full record for each source and wrap it
        detail = self._client.datasets.get(project_id=self.project_id, dataset_id=src_id)
        sources.append(DataPortalDataset(dataset=detail, client=self._client))
    return sources
Objects representing the datasets used as sources for this dataset (if any)
@property
def params(self) -> dict:
    """Parameters used to generate the dataset."""
    detail = self._get_detail()
    return detail.params.to_dict()
Parameters used to generate the dataset
@property
def info(self) -> dict:
    """Extra information about the dataset."""
    detail = self._get_detail()
    return detail.info.to_dict()
Extra information about the dataset
@property
def created_by(self) -> str:
    """Username of the user who created the dataset."""
    record = self._data
    return record.created_by
User who created the dataset
@property
def created_at(self) -> datetime.datetime:
    """Timestamp of when the dataset was created."""
    record = self._data
    return record.created_at
Timestamp of dataset creation
def get_file(self, relative_path: str) -> DataPortalFile:
    """
    Get a file from the dataset using its relative path.

    Args:
        relative_path (str): Relative path of file within the dataset

    Returns:
        `from cirro.sdk.file import DataPortalFile`
    """
    files = self.list_files()

    # Users may omit the 'data/' prefix carried by analysis outputs,
    # so fall back to the prefixed path before giving up.
    for candidate in (relative_path, "data/" + relative_path):
        try:
            return files.get_by_id(candidate)
        except DataPortalAssetNotFound:
            pass

    # Report the path exactly as the user supplied it, not the
    # 'data/'-prefixed variant (which may be confusing to the user)
    raise DataPortalAssetNotFound(f"No file found with path '{relative_path}'.")
Get a file from the dataset using its relative path.
Arguments:
- relative_path (str): Relative path of file within the dataset
Returns:
from cirro.sdk.file import DataPortalFile
def list_files(self, file_limit: int = 100000) -> DataPortalFiles:
    """
    Return the list of files which make up the dataset.

    Args:
        file_limit (int): Maximum number of files to return (default 100,000)
    """
    listing = self._client.datasets.get_assets_listing(
        project_id=self.project_id,
        dataset_id=self.id,
        file_limit=file_limit
    )
    # Wrap every raw file entry in a DataPortalFile bound to this client
    wrapped = [DataPortalFile(file=entry, client=self._client) for entry in listing.files]
    return DataPortalFiles(wrapped)
Return the list of files which make up the dataset.
Arguments:
- file_limit (int): Maximum number of files to return (default 100,000)
def read_files(
        self,
        glob: str = None,
        pattern: str = None,
        filetype: str = None,
        **kwargs
):
    """
    Read the contents of files in the dataset.

    See :meth:`~cirro.sdk.portal.DataPortal.read_files` for full details
    on ``glob``/``pattern`` matching and filetype options.

    Args:
        glob (str): Wildcard expression to match files; yields the parsed
            content of each matching file.
        pattern (str): Wildcard expression with ``{name}`` capture
            placeholders; yields ``(content, meta)`` per matching file.
        filetype (str): File format used to parse each file
            (or ``None`` to infer from extension).
        **kwargs: Additional keyword arguments forwarded to the
            file-parsing function.

    Yields:
        - When using ``glob``: *content* for each matching file
        - When using ``pattern``: ``(content, meta)`` for each matching file
    """
    if glob is not None and pattern is not None:
        raise DataPortalInputError("Cannot specify both 'glob' and 'pattern' — use one or the other")
    if glob is None and pattern is None:
        raise DataPortalInputError("Must specify either 'glob' or 'pattern'")

    if glob is not None:
        # Simple wildcard match: yield parsed contents only
        for matched in filter_files_by_pattern(list(self.list_files()), glob):
            yield _read_file_with_format(matched, filetype, **kwargs)
        return

    # Capture-pattern match: yield (content, captured-names dict)
    regex, _ = _pattern_to_captures_regex(pattern)
    for candidate in self.list_files():
        hit = regex.match(candidate.relative_path)
        if hit is None:
            continue
        yield _read_file_with_format(candidate, filetype, **kwargs), hit.groupdict()
Read the contents of files in the dataset.
See `DataPortal.read_files` for full details
on glob/pattern matching and filetype options.
Arguments:
- glob (str): Wildcard expression to match files. Yields one item per matching file: the parsed content.
- pattern (str): Wildcard expression with `{name}` capture placeholders.
  Yields `(content, meta)` per matching file.
- filetype (str): File format used to parse each file
  (or `None` to infer from the file extension).
- **kwargs: Additional keyword arguments forwarded to the file-parsing function.
Yields:
- When using `glob`: *content* for each matching file
- When using `pattern`: `(content, meta)` for each matching file
def read_file(
        self,
        path: str = None,
        glob: str = None,
        filetype: str = None,
        **kwargs
) -> Any:
    """
    Read the contents of a single file from the dataset.

    See :meth:`~cirro.sdk.portal.DataPortal.read_file` for full details.

    Args:
        path (str): Exact relative path of the file within the dataset.
        glob (str): Wildcard expression matching exactly one file.
        filetype (str): File format used to parse the file. Supported values
            are the same as :meth:`~cirro.sdk.portal.DataPortal.read_files`.
        **kwargs: Additional keyword arguments forwarded to the file-parsing
            function.

    Returns:
        Parsed file content.
    """
    if path is not None and glob is not None:
        raise DataPortalInputError("Cannot specify both 'path' and 'glob' — use one or the other")
    if path is None and glob is None:
        raise DataPortalInputError("Must specify either 'path' or 'glob'")

    if path is not None:
        target = self.get_file(path)
    else:
        # A glob must resolve to exactly one file
        hits = list(filter_files_by_pattern(list(self.list_files()), glob))
        if not hits:
            raise DataPortalAssetNotFound(f"No files matched glob '{glob}'")
        if len(hits) > 1:
            raise DataPortalInputError(
                f"glob '{glob}' matched {len(hits)} files — use read_files() to read multiple files"
            )
        target = hits[0]

    return _read_file_with_format(target, filetype, **kwargs)
Read the contents of a single file from the dataset.
See `DataPortal.read_file` for full details.
Arguments:
- path (str): Exact relative path of the file within the dataset.
- glob (str): Wildcard expression matching exactly one file.
- filetype (str): File format used to parse the file. Supported values
  are the same as `DataPortal.read_files`.
- **kwargs: Additional keyword arguments forwarded to the file-parsing function.
Returns:
Parsed file content.
def get_trace(self) -> Any:
    """
    Read the Nextflow workflow trace file for this dataset as a DataFrame.

    Returns:
        `pandas.DataFrame`
    """
    trace_artifact = self.get_artifact(ArtifactType.WORKFLOW_TRACE)
    # The trace is a tab-separated table
    return trace_artifact.read_csv(sep='\t')
Read the Nextflow workflow trace file for this dataset as a DataFrame.
Returns:
pandas.DataFrame
def get_logs(self) -> str:
    """
    Read the Nextflow workflow logs for this dataset as a string.

    Returns:
        str
    """
    log_artifact = self.get_artifact(ArtifactType.WORKFLOW_LOGS)
    return log_artifact.read()
Read the Nextflow workflow logs for this dataset as a string.
Returns:
str
def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
    """
    Get the artifact of a particular type from the dataset.

    Raises:
        DataPortalAssetNotFound: if no artifact of that type exists
    """
    for artifact in self._get_assets().artifacts:
        if artifact.artifact_type == artifact_type:
            return DataPortalFile(file=artifact.file, client=self._client)
    raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'")
Get the artifact of a particular type from the dataset
def list_artifacts(self) -> List[DataPortalFile]:
    """
    Return the list of artifacts associated with the dataset.

    An artifact may be something generated as part of the analysis or other process.
    See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types.
    """
    entries = self._get_assets().artifacts
    return DataPortalFiles(
        [DataPortalFile(file=entry.file, client=self._client) for entry in entries]
    )
Return the list of artifacts associated with the dataset
An artifact may be something generated as part of the analysis or other process.
See cirro_api_client.v1.models.ArtifactType for the list of possible artifact types.
def download_files(self, download_location: str = None, glob: str = None) -> None:
    """
    Download all the files from the dataset to a local directory.

    Args:
        download_location (str): Path to local directory
        glob (str): Optional wildcard expression to filter which files are
            downloaded (e.g. `'*.csv'`, `'data/**/*.tsv.gz'`).
            If omitted, all files are downloaded.
    """
    to_download = self.list_files()
    if glob is not None:
        # Narrow the listing to files whose relative path matches the glob
        to_download = DataPortalFiles(filter_files_by_pattern(list(to_download), glob))
    to_download.download(download_location)
Download all the files from the dataset to a local directory.
Arguments:
- download_location (str): Path to local directory
- glob (str): Optional wildcard expression to filter which files are downloaded
  (e.g. `'*.csv'`, `'data/**/*.tsv.gz'`).
  If omitted, all files are downloaded.
def run_analysis(
        self,
        name: str = None,
        description: str = "",
        process: Union[DataPortalProcess, str] = None,
        params=None,
        notifications_emails: List[str] = None,
        compute_environment: str = None,
        resume_dataset_id: str = None,
        source_sample_ids: List[str] = None
) -> str:
    """
    Runs an analysis on a dataset, returns the ID of the newly created dataset.

    The process can be provided as either a DataPortalProcess object,
    or a string which corresponds to the name or ID of the process.

    Args:
        name (str): Name of newly created dataset
        description (str): Description of newly created dataset
        process (DataPortalProcess or str): Process to run
        params (dict): Analysis parameters
        notifications_emails (List[str]): Notification email address(es)
        compute_environment (str): Name or ID of compute environment to use,
            if blank it will run in AWS
        resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
            It will attempt to re-use the previous output to minimize duplicate work
        source_sample_ids (List[str]): List of sample IDs to use as input for the analysis.

    Returns:
        dataset_id (str): ID of newly created dataset
    """
    # Required inputs
    if name is None:
        raise DataPortalInputError("Must specify 'name' for run_analysis")
    if process is None:
        raise DataPortalInputError("Must specify 'process' for run_analysis")

    # Optional inputs default to empty containers
    notifications_emails = [] if notifications_emails is None else notifications_emails
    params = {} if params is None else params

    # If the process is a string, try to parse it as a process name or ID
    process = parse_process_name_or_id(process, self._client)

    # Resolve the compute environment (by name or ID) when one is requested
    compute_environment_id = None
    if compute_environment:
        requested = compute_environment
        available = self._client.compute_environments.list_environments_for_project(
            project_id=self.project_id
        )
        matched = next(
            (env for env in available if env.name == requested or env.id == requested),
            None
        )
        if matched is None:
            raise DataPortalInputError(f"Compute environment '{requested}' not found")
        compute_environment_id = matched.id

    resp = self._client.execution.run_analysis(
        project_id=self.project_id,
        request=RunAnalysisRequest(
            name=name,
            description=description,
            process_id=process.id,
            source_dataset_ids=[self.id],
            params=RunAnalysisRequestParams.from_dict(params),
            notification_emails=notifications_emails,
            resume_dataset_id=resume_dataset_id,
            source_sample_ids=source_sample_ids,
            compute_environment_id=compute_environment_id
        )
    )
    return resp.id
Runs an analysis on a dataset, returns the ID of the newly created dataset.
The process can be provided as either a DataPortalProcess object, or a string which corresponds to the name or ID of the process.
Arguments:
- name (str): Name of newly created dataset
- description (str): Description of newly created dataset
- process (DataPortalProcess or str): Process to run
- params (dict): Analysis parameters
- notifications_emails (List[str]): Notification email address(es)
- compute_environment (str): Name or ID of compute environment to use, if blank it will run in AWS
- resume_dataset_id (str): ID of dataset to resume from, used for caching task execution. It will attempt to re-use the previous output to minimize duplicate work
- source_sample_ids (List[str]): List of sample IDs to use as input for the analysis.
Returns:
dataset_id (str): ID of newly created dataset
def update_samplesheet(self,
                       contents: str = None,
                       file_path: PathLike = None):
    """
    Updates the samplesheet metadata of a dataset.

    Provide either the contents (as a string) or a file path.
    Both must be in the format of a CSV.

    Args:
        contents (str): Samplesheet contents to update (should be a CSV string)
        file_path (PathLike): Path of file to update (should be a CSV file)

    Example:
    ```python
    dataset.update_samplesheet(
        file_path=Path('~/samplesheet.csv')
    )
    ```
    """
    if contents is None and file_path is None:
        raise DataPortalInputError("Must specify either 'contents' or 'file_path' when updating samplesheet")

    # A file path, when given, takes precedence over inline contents
    if file_path is not None:
        new_contents = Path(file_path).expanduser().read_text()
    else:
        new_contents = contents

    # Validate the proposed samplesheet against the dataset's files
    validation = validate_file_requirements.sync(
        process_id=self.process_id,
        body=ValidateFileRequirementsRequest(
            file_names=[f.file_name for f in self.list_files()],
            sample_sheet=new_contents,
        ),
        client=self._client.api_client
    )
    if error_msg := validation.error_msg:
        raise DataPortalInputError(error_msg)

    # Update the samplesheet if everything looks ok
    self._client.datasets.update_samplesheet(
        project_id=self.project_id,
        dataset_id=self.id,
        samplesheet=new_contents
    )
Updates the samplesheet metadata of a dataset. Provide either the contents (as a string) or a file path. Both must be in the format of a CSV.
Arguments:
- contents (str): Samplesheet contents to update (should be a CSV string)
- file_path (PathLike): Path of file to update (should be a CSV file)
Example:
dataset.update_samplesheet(
file_path=Path('~/samplesheet.csv')
)
class DataPortalReference(DataPortalAsset):
    """
    Reference data object containing files which can be used for analysis in a particular project.
    """
    def __init__(self, ref: Reference, project_id: str, client: CirroApi):
        """
        Instantiate by listing the references which have been added to a particular project

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        project = portal.get_project_by_name("Project Name")
        references = project.list_references()
        ```
        """
        self._data = ref
        # Wrap each raw file entry in a DataPortalFile bound to this project
        self._files = [
            DataPortalFile(File.from_file_entry(f, project_id), client) for f in ref.files
        ]

    @property
    def files(self) -> List[DataPortalFile]:
        """File(s) contained in the reference"""
        return self._files

    @property
    def name(self) -> str:
        """Reference name"""
        return self._data.name

    @property
    def type(self) -> str:
        """Type of reference data (e.g. genome_fasta)"""
        return self._data.type_

    @property
    def absolute_path(self):
        """Absolute path of the first file in the reference (None if the reference has no files)"""
        if len(self._files) == 0:
            return None
        return self._files[0].absolute_path

    def __str__(self):
        return self.name
Reference data object containing files which can be used for analysis in a particular project.
def __init__(self, ref: Reference, project_id: str, client: CirroApi):
    """
    Instantiate by listing the references which have been added to a particular project

    ```python
    from cirro import DataPortal
    portal = DataPortal()
    project = portal.get_project_by_name("Project Name")
    references = project.list_references()
    ```
    """
    self._data = ref
    # Wrap each raw file entry in a DataPortalFile bound to this project
    wrapped = []
    for entry in ref.files:
        wrapped.append(DataPortalFile(File.from_file_entry(entry, project_id), client))
    self._files = wrapped
Instantiate by listing the references which have been added to a particular project
from cirro import DataPortal
portal = DataPortal()
project = portal.get_project_by_name("Project Name")
references = project.list_references()
@property
def files(self) -> List[DataPortalFile]:
    """File(s) contained in the reference"""
    return self._files
File(s) contained in the reference
class CirroApi:
    """
    Client for interacting directly with the Cirro API
    """
    def __init__(self, auth_info: AuthInfo = None, base_url: str = None, user_agent: str = 'Cirro SDK'):
        """
        Instantiates the Cirro API object

        Args:
            auth_info (cirro.auth.base.AuthInfo):
            base_url (str): Optional base URL of the Cirro instance
                (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)

        Returns:
            Authenticated Cirro API object, which can be used to call endpoint functions.

        Example:
        ```python
        from cirro.cirro_client import CirroApi

        cirro = CirroApi(base_url="app.cirro.bio")
        print(cirro.projects.list())
        ```
        """
        self._configuration = AppConfig(base_url=base_url)
        # Fall back to configuration-derived credentials when none are supplied
        if not auth_info:
            auth_info = get_auth_info_from_config(self._configuration, auth_io=None)

        self._api_client = CirroApiClient(
            base_url=self._configuration.rest_endpoint,
            auth_method=auth_info.get_auth_method(),
            client_name=user_agent,
            package_name='cirro'
        )

        # Init services; the file service is built first because the
        # dataset and reference services depend on it
        self._file_service = FileService(
            self._api_client,
            checksum_method=self._configuration.checksum_method,
            transfer_retries=self._configuration.transfer_max_retries
        )
        self._dataset_service = DatasetService(self._api_client, file_service=self._file_service)
        self._references_service = ReferenceService(self._api_client, file_service=self._file_service)
        self._project_service = ProjectService(self._api_client)
        self._process_service = ProcessService(self._api_client)
        self._execution_service = ExecutionService(self._api_client)
        self._compute_environment_service = ComputeEnvironmentService(self._api_client)
        self._metrics_service = MetricsService(self._api_client)
        self._metadata_service = MetadataService(self._api_client)
        self._billing_service = BillingService(self._api_client)
        self._shares_service = ShareService(self._api_client)
        self._users_service = UserService(self._api_client)
        self._workspace_service = WorkspaceService(self._api_client)

    @property
    def datasets(self) -> DatasetService:
        """Create, list, delete, and modify Datasets"""
        return self._dataset_service

    @property
    def projects(self) -> ProjectService:
        """Create, list, delete, and modify Projects"""
        return self._project_service

    @property
    def processes(self) -> ProcessService:
        """List and retrieve detailed information about Processes"""
        return self._process_service

    @property
    def execution(self) -> ExecutionService:
        """List, run, stop, and describe the analysis jobs (executing Processes to create new Datasets)"""
        return self._execution_service

    @property
    def compute_environments(self) -> ComputeEnvironmentService:
        """List and update compute environments"""
        return self._compute_environment_service

    @property
    def metrics(self) -> MetricsService:
        """Project-level summary metrics"""
        return self._metrics_service

    @property
    def metadata(self) -> MetadataService:
        """List and modify Sample metadata or metadata schemas"""
        return self._metadata_service

    @property
    def billing(self) -> BillingService:
        """List and update billing accounts"""
        return self._billing_service

    @property
    def references(self) -> ReferenceService:
        """List References and Reference types"""
        return self._references_service

    @property
    def shares(self) -> ShareService:
        """List, create, update, delete, and subscribe to shares"""
        return self._shares_service

    @property
    def users(self) -> UserService:
        """List and update user information"""
        return self._users_service

    @property
    def workspaces(self) -> WorkspaceService:
        """Manage workspaces"""
        return self._workspace_service

    @property
    def file(self) -> FileService:
        """Read, download, and create file objects"""
        return self._file_service

    @property
    def api_client(self) -> CirroApiClient:
        """Gets the underlying API client"""
        return self._api_client

    @property
    def configuration(self) -> AppConfig:
        """Gets the configuration of the instance"""
        return self._configuration
Client for interacting directly with the Cirro API
def __init__(self, auth_info: AuthInfo = None, base_url: str = None, user_agent: str = 'Cirro SDK'):
    """
    Instantiates the Cirro API object

    Args:
        auth_info (cirro.auth.base.AuthInfo):
        base_url (str): Optional base URL of the Cirro instance
            (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)

    Returns:
        Authenticated Cirro API object, which can be used to call endpoint functions.

    Example:
    ```python
    from cirro.cirro_client import CirroApi

    cirro = CirroApi(base_url="app.cirro.bio")
    print(cirro.projects.list())
    ```
    """
    self._configuration = AppConfig(base_url=base_url)
    # Fall back to configuration-derived credentials when none are supplied
    if not auth_info:
        auth_info = get_auth_info_from_config(self._configuration, auth_io=None)

    self._api_client = CirroApiClient(
        base_url=self._configuration.rest_endpoint,
        auth_method=auth_info.get_auth_method(),
        client_name=user_agent,
        package_name='cirro'
    )

    # Init services; the file service is built first because the
    # dataset and reference services depend on it
    self._file_service = FileService(
        self._api_client,
        checksum_method=self._configuration.checksum_method,
        transfer_retries=self._configuration.transfer_max_retries
    )
    self._dataset_service = DatasetService(self._api_client, file_service=self._file_service)
    self._references_service = ReferenceService(self._api_client, file_service=self._file_service)
    self._project_service = ProjectService(self._api_client)
    self._process_service = ProcessService(self._api_client)
    self._execution_service = ExecutionService(self._api_client)
    self._compute_environment_service = ComputeEnvironmentService(self._api_client)
    self._metrics_service = MetricsService(self._api_client)
    self._metadata_service = MetadataService(self._api_client)
    self._billing_service = BillingService(self._api_client)
    self._shares_service = ShareService(self._api_client)
    self._users_service = UserService(self._api_client)
    self._workspace_service = WorkspaceService(self._api_client)
Instantiates the Cirro API object
Arguments:
- auth_info (cirro.auth.base.AuthInfo): Optional authentication method (if not provided, it is derived from the local configuration)
- base_url (str): Optional base URL of the Cirro instance
(if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
Returns:
Authenticated Cirro API object, which can be used to call endpoint functions.
Example:
from cirro.cirro_client import CirroApi
cirro = CirroApi(base_url="app.cirro.bio")
print(cirro.projects.list())
@property
def datasets(self) -> DatasetService:
    """Create, list, delete, and modify Datasets"""
    return self._dataset_service
Create, list, delete, and modify Datasets
@property
def projects(self) -> ProjectService:
    """Create, list, delete, and modify Projects"""
    return self._project_service
Create, list, delete, and modify Projects
@property
def processes(self) -> ProcessService:
    """List and retrieve detailed information about Processes"""
    return self._process_service
List and retrieve detailed information about Processes
@property
def execution(self) -> ExecutionService:
    """List, run, stop, and describe the analysis jobs (executing Processes to create new Datasets)"""
    return self._execution_service
List, run, stop, and describe the analysis jobs (executing Processes to create new Datasets)
@property
def compute_environments(self) -> ComputeEnvironmentService:
    """List and update compute environments"""
    return self._compute_environment_service
List and update compute environments
@property
def metrics(self) -> MetricsService:
    """Project-level summary metrics"""
    return self._metrics_service
Project-level summary metrics
@property
def metadata(self) -> MetadataService:
    """List and modify Sample metadata or metadata schemas"""
    return self._metadata_service
List and modify Sample metadata or metadata schemas
@property
def billing(self) -> BillingService:
    """List and update billing accounts"""
    return self._billing_service
List and update billing accounts
@property
def references(self) -> ReferenceService:
    """List References and Reference types"""
    return self._references_service
List References and Reference types
@property
def users(self) -> UserService:
    """List and update user information"""
    return self._users_service
List and update user information
@property
def workspaces(self) -> WorkspaceService:
    """Manage workspaces"""
    return self._workspace_service
Manage workspaces
@property
def file(self) -> FileService:
    """Read, download, and create file objects"""
    return self._file_service
Read, download, and create file objects