cirro.sdk.portal
from cirro_api_client.v1.models import Executor

from cirro.cirro_client import CirroApi
from cirro.sdk.dataset import DataPortalDataset
from cirro.sdk.developer import DeveloperHelper
from cirro.sdk.exceptions import DataPortalAssetNotFound
from cirro.sdk.process import DataPortalProcess, DataPortalProcesses
from cirro.sdk.project import DataPortalProject, DataPortalProjects
from cirro.sdk.reference_type import DataPortalReferenceType, DataPortalReferenceTypes


class DataPortal:
    """
    Helper functions for exploring the Projects, Datasets, Samples, and Files
    available in the Data Portal.
    """

    def __init__(self, base_url: str = None, client: CirroApi = None):
        """
        Set up the DataPortal object, establishing an authenticated connection.

        Args:
            base_url (str): Optional base URL of the Cirro instance
                (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
            client (`cirro.cirro_client.CirroApi`): Optional pre-configured client

        Example:
        ```python
        from cirro import DataPortal

        portal = DataPortal(base_url="app.cirro.bio")
        portal.list_projects()
        ```
        """
        if client is not None:
            self._client = client
        # Set up default client if not provided
        else:
            self._client = CirroApi(base_url=base_url)

    def list_projects(self) -> DataPortalProjects:
        """List all the projects available in the Data Portal."""

        return DataPortalProjects(
            [
                DataPortalProject(proj, self._client)
                for proj in self._client.projects.list()
            ]
        )

    def get_project_by_name(self, name: str = None) -> DataPortalProject:
        """Return the project with the specified name."""

        return self.list_projects().get_by_name(name)

    def get_project_by_id(self, _id: str = None) -> DataPortalProject:
        """Return the project with the specified id."""

        return self.list_projects().get_by_id(_id)

    def get_project(self, project: str = None) -> DataPortalProject:
        """
        Return a project identified by ID or name.

        Args:
            project (str): ID or name of project

        Returns:
            `cirro.sdk.project.DataPortalProject`
        """
        # Try ID lookup first; fall back to name lookup if no match.
        try:
            return self.get_project_by_id(project)
        except DataPortalAssetNotFound:
            return self.get_project_by_name(project)

    def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset:
        """
        Return a dataset identified by ID or name.

        Args:
            project (str): ID or name of project
            dataset (str): ID or name of dataset

        Returns:
            `cirro.sdk.dataset.DataPortalDataset`

        ```python
        from cirro import DataPortal

        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        ```
        """
        # Resolve the project by ID first, falling back to name lookup.
        try:
            parent: DataPortalProject = self.get_project_by_id(project)
        except DataPortalAssetNotFound:
            parent = self.get_project_by_name(project)

        return parent.get_dataset(dataset)

    def read_files(
        self,
        project: str,
        dataset: str,
        glob: str = None,
        pattern: str = None,
        filetype: str = None,
        **kwargs
    ):
        """
        Read the contents of files from a dataset.

        The project and dataset can each be identified by name or ID.
        Exactly one of ``glob`` or ``pattern`` must be provided.

        **glob** — standard wildcard matching; yields the file content for each
        matching file:

        - ``*`` matches any characters within a single path segment
        - ``**`` matches zero or more path segments
        - Matching is suffix-anchored (``*.csv`` matches at any depth)

        **pattern** — like ``glob`` but ``{name}`` placeholders capture portions
        of the path automatically; yields ``(content, meta)`` pairs where
        *meta* is a ``dict`` of extracted values:

        - ``{name}`` captures one path segment (no ``/``)
        - ``*`` and ``**`` wildcards work as in ``glob``

        Args:
            project (str): ID or name of the project.
            dataset (str): ID or name of the dataset.
            glob (str): Wildcard expression to match files
                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
                Yields one item per matching file: the parsed content.
            pattern (str): Wildcard expression with ``{name}`` capture
                placeholders (e.g., ``'{sample}.csv'``,
                ``'{condition}/{sample}.csv'``).
                Yields ``(content, meta)`` per matching file.
            filetype (str): File format used to parse each file. Supported values:

                - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
                - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
                - ``'json'``: parse with :func:`json.loads`, returns a Python object
                - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame``
                  (requires ``pyarrow`` or ``fastparquet``)
                - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame``
                  (requires ``pyarrow``)
                - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object
                - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame``
                  (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``)
                - ``'text'``: read as plain text, returns a ``str``
                - ``'bytes'``: read as raw bytes, returns ``bytes``
                - ``None`` (default): infer from file extension
                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``,
                  ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
                  ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``,
                  ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``)
            **kwargs: Additional keyword arguments forwarded to the file-parsing
                function (e.g., ``sep='\\t'`` for CSV/TSV files).

        Yields:
            - When using ``glob``: *content* for each matching file
            - When using ``pattern``: ``(content, meta)`` for each matching file,
              where *meta* is a ``dict`` of values extracted from ``{name}``
              placeholders

        Raises:
            DataPortalInputError: if both ``glob`` and ``pattern`` are provided,
                or if neither is provided.

        Example:
        ```python
        # Read all CSV files — just the content
        for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
            print(df.shape)

        # Extract sample names from filenames automatically
        for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
            print(meta['sample'], df.shape)

        # Multi-level capture: condition directory + sample filename
        for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
            print(meta['condition'], meta['sample'], df.shape)

        # Read gzip-compressed TSV files with explicit separator
        for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'):
            print(df.shape)
        ```
        """
        ds = self.get_dataset(project=project, dataset=dataset)
        yield from ds.read_files(glob=glob, pattern=pattern, filetype=filetype, **kwargs)

    def read_file(
        self,
        project: str,
        dataset: str,
        path: str = None,
        glob: str = None,
        filetype: str = None,
        **kwargs
    ):
        """
        Read the contents of a single file from a dataset.

        The project and dataset can each be identified by name or ID.
        Provide either ``path`` (exact relative path) or ``glob`` (wildcard
        expression). If ``glob`` is used it must match exactly one file.

        Args:
            project (str): ID or name of the project.
            dataset (str): ID or name of the dataset.
            path (str): Exact relative path of the file within the dataset.
            glob (str): Wildcard expression matching exactly one file.
            filetype (str): File format used to parse the file. Supported values
                are the same as :meth:`read_files`.
            **kwargs: Additional keyword arguments forwarded to the
                file-parsing function.

        Returns:
            Parsed file content.

        Raises:
            DataPortalInputError: if both or neither of ``path``/``glob`` are
                provided, or if ``glob`` matches zero or more than one file.
        """
        ds = self.get_dataset(project=project, dataset=dataset)
        return ds.read_file(path=path, glob=glob, filetype=filetype, **kwargs)

    def list_processes(self, ingest=False) -> DataPortalProcesses:
        """
        List the processes available in the Data Portal.

        With the default ``ingest=False`` every process is returned; with
        ``ingest=True`` only the processes which can be used to ingest
        (upload) datasets directly are returned.

        Args:
            ingest (bool): If True, only list those processes which can be used to ingest datasets directly
        """

        # NOTE(review): when ingest is False the filter passes everything, so
        # all processes (including ingest ones) are listed. Kept as-is because
        # callers may rely on this behavior.
        return DataPortalProcesses(
            [
                DataPortalProcess(p, self._client)
                for p in self._client.processes.list()
                if not ingest or p.executor == Executor.INGEST
            ]
        )

    def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess:
        """
        Return the process with the specified name.

        Args:
            name (str): Name of process
        """

        return self.list_processes(ingest=ingest).get_by_name(name)

    def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess:
        """
        Return the process with the specified id.

        Args:
            id (str): ID of process
        """

        return self.list_processes(ingest=ingest).get_by_id(id)

    def list_reference_types(self) -> DataPortalReferenceTypes:
        """
        Return the list of all available reference types.
        """

        return DataPortalReferenceTypes(
            [
                DataPortalReferenceType(ref)
                for ref in self._client.references.get_types()
            ]
        )

    @property
    def developer_helper(self) -> DeveloperHelper:
        # Expose low-level developer utilities bound to the same client.
        return DeveloperHelper(self._client)
class DataPortal:
    """
    Helper functions for exploring the Projects, Datasets, Samples, and Files
    available in the Data Portal.
    """

    def __init__(self, base_url: str = None, client: CirroApi = None):
        """
        Set up the DataPortal object, establishing an authenticated connection.

        Args:
            base_url (str): Optional base URL of the Cirro instance
                (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
            client (`cirro.cirro_client.CirroApi`): Optional pre-configured client

        Example:
        ```python
        from cirro import DataPortal

        portal = DataPortal(base_url="app.cirro.bio")
        portal.list_projects()
        ```
        """
        if client is not None:
            self._client = client
        # Set up default client if not provided
        else:
            self._client = CirroApi(base_url=base_url)

    def list_projects(self) -> DataPortalProjects:
        """List all the projects available in the Data Portal."""

        return DataPortalProjects(
            [
                DataPortalProject(proj, self._client)
                for proj in self._client.projects.list()
            ]
        )

    def get_project_by_name(self, name: str = None) -> DataPortalProject:
        """Return the project with the specified name."""

        return self.list_projects().get_by_name(name)

    def get_project_by_id(self, _id: str = None) -> DataPortalProject:
        """Return the project with the specified id."""

        return self.list_projects().get_by_id(_id)

    def get_project(self, project: str = None) -> DataPortalProject:
        """
        Return a project identified by ID or name.

        Args:
            project (str): ID or name of project

        Returns:
            `cirro.sdk.project.DataPortalProject`
        """
        # Try ID lookup first; fall back to name lookup if no match.
        try:
            return self.get_project_by_id(project)
        except DataPortalAssetNotFound:
            return self.get_project_by_name(project)

    def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset:
        """
        Return a dataset identified by ID or name.

        Args:
            project (str): ID or name of project
            dataset (str): ID or name of dataset

        Returns:
            `cirro.sdk.dataset.DataPortalDataset`

        ```python
        from cirro import DataPortal

        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        ```
        """
        # Resolve the project by ID first, falling back to name lookup.
        try:
            parent: DataPortalProject = self.get_project_by_id(project)
        except DataPortalAssetNotFound:
            parent = self.get_project_by_name(project)

        return parent.get_dataset(dataset)

    def read_files(
        self,
        project: str,
        dataset: str,
        glob: str = None,
        pattern: str = None,
        filetype: str = None,
        **kwargs
    ):
        """
        Read the contents of files from a dataset.

        The project and dataset can each be identified by name or ID.
        Exactly one of ``glob`` or ``pattern`` must be provided.

        **glob** — standard wildcard matching; yields the file content for each
        matching file:

        - ``*`` matches any characters within a single path segment
        - ``**`` matches zero or more path segments
        - Matching is suffix-anchored (``*.csv`` matches at any depth)

        **pattern** — like ``glob`` but ``{name}`` placeholders capture portions
        of the path automatically; yields ``(content, meta)`` pairs where
        *meta* is a ``dict`` of extracted values:

        - ``{name}`` captures one path segment (no ``/``)
        - ``*`` and ``**`` wildcards work as in ``glob``

        Args:
            project (str): ID or name of the project.
            dataset (str): ID or name of the dataset.
            glob (str): Wildcard expression to match files
                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
                Yields one item per matching file: the parsed content.
            pattern (str): Wildcard expression with ``{name}`` capture
                placeholders (e.g., ``'{sample}.csv'``,
                ``'{condition}/{sample}.csv'``).
                Yields ``(content, meta)`` per matching file.
            filetype (str): File format used to parse each file. Supported values:

                - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
                - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
                - ``'json'``: parse with :func:`json.loads`, returns a Python object
                - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame``
                  (requires ``pyarrow`` or ``fastparquet``)
                - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame``
                  (requires ``pyarrow``)
                - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object
                - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame``
                  (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``)
                - ``'text'``: read as plain text, returns a ``str``
                - ``'bytes'``: read as raw bytes, returns ``bytes``
                - ``None`` (default): infer from file extension
                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``,
                  ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
                  ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``,
                  ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``)
            **kwargs: Additional keyword arguments forwarded to the file-parsing
                function (e.g., ``sep='\\t'`` for CSV/TSV files).

        Yields:
            - When using ``glob``: *content* for each matching file
            - When using ``pattern``: ``(content, meta)`` for each matching file,
              where *meta* is a ``dict`` of values extracted from ``{name}``
              placeholders

        Raises:
            DataPortalInputError: if both ``glob`` and ``pattern`` are provided,
                or if neither is provided.

        Example:
        ```python
        # Read all CSV files — just the content
        for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
            print(df.shape)

        # Extract sample names from filenames automatically
        for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
            print(meta['sample'], df.shape)

        # Multi-level capture: condition directory + sample filename
        for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
            print(meta['condition'], meta['sample'], df.shape)

        # Read gzip-compressed TSV files with explicit separator
        for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'):
            print(df.shape)
        ```
        """
        ds = self.get_dataset(project=project, dataset=dataset)
        yield from ds.read_files(glob=glob, pattern=pattern, filetype=filetype, **kwargs)

    def read_file(
        self,
        project: str,
        dataset: str,
        path: str = None,
        glob: str = None,
        filetype: str = None,
        **kwargs
    ):
        """
        Read the contents of a single file from a dataset.

        The project and dataset can each be identified by name or ID.
        Provide either ``path`` (exact relative path) or ``glob`` (wildcard
        expression). If ``glob`` is used it must match exactly one file.

        Args:
            project (str): ID or name of the project.
            dataset (str): ID or name of the dataset.
            path (str): Exact relative path of the file within the dataset.
            glob (str): Wildcard expression matching exactly one file.
            filetype (str): File format used to parse the file. Supported values
                are the same as :meth:`read_files`.
            **kwargs: Additional keyword arguments forwarded to the
                file-parsing function.

        Returns:
            Parsed file content.

        Raises:
            DataPortalInputError: if both or neither of ``path``/``glob`` are
                provided, or if ``glob`` matches zero or more than one file.
        """
        ds = self.get_dataset(project=project, dataset=dataset)
        return ds.read_file(path=path, glob=glob, filetype=filetype, **kwargs)

    def list_processes(self, ingest=False) -> DataPortalProcesses:
        """
        List the processes available in the Data Portal.

        With the default ``ingest=False`` every process is returned; with
        ``ingest=True`` only the processes which can be used to ingest
        (upload) datasets directly are returned.

        Args:
            ingest (bool): If True, only list those processes which can be used to ingest datasets directly
        """

        # NOTE(review): when ingest is False the filter passes everything, so
        # all processes (including ingest ones) are listed. Kept as-is because
        # callers may rely on this behavior.
        return DataPortalProcesses(
            [
                DataPortalProcess(p, self._client)
                for p in self._client.processes.list()
                if not ingest or p.executor == Executor.INGEST
            ]
        )

    def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess:
        """
        Return the process with the specified name.

        Args:
            name (str): Name of process
        """

        return self.list_processes(ingest=ingest).get_by_name(name)

    def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess:
        """
        Return the process with the specified id.

        Args:
            id (str): ID of process
        """

        return self.list_processes(ingest=ingest).get_by_id(id)

    def list_reference_types(self) -> DataPortalReferenceTypes:
        """
        Return the list of all available reference types.
        """

        return DataPortalReferenceTypes(
            [
                DataPortalReferenceType(ref)
                for ref in self._client.references.get_types()
            ]
        )

    @property
    def developer_helper(self) -> DeveloperHelper:
        # Expose low-level developer utilities bound to the same client.
        return DeveloperHelper(self._client)
Helper functions for exploring the Projects, Datasets, Samples, and Files available in the Data Portal.
def __init__(self, base_url: str = None, client: CirroApi = None):
    """
    Set up the DataPortal object, establishing an authenticated connection.

    Args:
        base_url (str): Optional base URL of the Cirro instance
            (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
        client (`cirro.cirro_client.CirroApi`): Optional pre-configured client

    Example:
    ```python
    from cirro import DataPortal

    portal = DataPortal(base_url="app.cirro.bio")
    portal.list_projects()
    ```
    """
    if client is not None:
        self._client = client
    # Set up default client if not provided
    else:
        self._client = CirroApi(base_url=base_url)
Set up the DataPortal object, establishing an authenticated connection.
Arguments:
- base_url (str): Optional base URL of the Cirro instance
(if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
- client (`cirro.cirro_client.CirroApi`): Optional pre-configured client
Example:
from cirro import DataPortal
portal = DataPortal(base_url="app.cirro.bio")
portal.list_projects()
def list_projects(self) -> DataPortalProjects:
    """List all the projects available in the Data Portal."""

    return DataPortalProjects(
        [
            DataPortalProject(proj, self._client)
            for proj in self._client.projects.list()
        ]
    )
List all the projects available in the Data Portal.
def get_project_by_name(self, name: str = None) -> DataPortalProject:
    """Return the project with the specified name."""

    return self.list_projects().get_by_name(name)
Return the project with the specified name.
def get_project_by_id(self, _id: str = None) -> DataPortalProject:
    """Return the project with the specified id."""

    return self.list_projects().get_by_id(_id)
Return the project with the specified id.
def get_project(self, project: str = None) -> DataPortalProject:
    """
    Return a project identified by ID or name.

    Args:
        project (str): ID or name of project

    Returns:
        `cirro.sdk.project.DataPortalProject`
    """
    # Try ID lookup first; fall back to name lookup if no match.
    try:
        return self.get_project_by_id(project)
    except DataPortalAssetNotFound:
        return self.get_project_by_name(project)
Return a project identified by ID or name.
Arguments:
- project (str): ID or name of project
Returns:
`cirro.sdk.project.DataPortalProject`
def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset:
    """
    Return a dataset identified by ID or name.

    Args:
        project (str): ID or name of project
        dataset (str): ID or name of dataset

    Returns:
        `cirro.sdk.dataset.DataPortalDataset`

    ```python
    from cirro import DataPortal

    portal = DataPortal()
    dataset = portal.get_dataset(
        project="id-or-name-of-project",
        dataset="id-or-name-of-dataset"
    )
    ```
    """
    # Resolve the project by ID first, falling back to name lookup.
    try:
        parent: DataPortalProject = self.get_project_by_id(project)
    except DataPortalAssetNotFound:
        parent = self.get_project_by_name(project)

    return parent.get_dataset(dataset)
Return a dataset identified by ID or name.
Arguments:
- project (str): ID or name of project
- dataset (str): ID or name of dataset
Returns:
`cirro.sdk.dataset.DataPortalDataset`

```python
from cirro import DataPortal

portal = DataPortal()
dataset = portal.get_dataset(
    project="id-or-name-of-project",
    dataset="id-or-name-of-dataset"
)
```
def read_files(
    self,
    project: str,
    dataset: str,
    glob: str = None,
    pattern: str = None,
    filetype: str = None,
    **kwargs
):
    """
    Read the contents of files from a dataset.

    The project and dataset can each be identified by name or ID.
    Exactly one of ``glob`` or ``pattern`` must be provided.

    **glob** — standard wildcard matching; yields the file content for each
    matching file:

    - ``*`` matches any characters within a single path segment
    - ``**`` matches zero or more path segments
    - Matching is suffix-anchored (``*.csv`` matches at any depth)

    **pattern** — like ``glob`` but ``{name}`` placeholders capture portions
    of the path automatically; yields ``(content, meta)`` pairs where
    *meta* is a ``dict`` of extracted values:

    - ``{name}`` captures one path segment (no ``/``)
    - ``*`` and ``**`` wildcards work as in ``glob``

    Args:
        project (str): ID or name of the project.
        dataset (str): ID or name of the dataset.
        glob (str): Wildcard expression to match files
            (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
            Yields one item per matching file: the parsed content.
        pattern (str): Wildcard expression with ``{name}`` capture
            placeholders (e.g., ``'{sample}.csv'``,
            ``'{condition}/{sample}.csv'``).
            Yields ``(content, meta)`` per matching file.
        filetype (str): File format used to parse each file. Supported values:

            - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
            - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
            - ``'json'``: parse with :func:`json.loads`, returns a Python object
            - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame``
              (requires ``pyarrow`` or ``fastparquet``)
            - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame``
              (requires ``pyarrow``)
            - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object
            - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame``
              (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``)
            - ``'text'``: read as plain text, returns a ``str``
            - ``'bytes'``: read as raw bytes, returns ``bytes``
            - ``None`` (default): infer from file extension
              (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``,
              ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
              ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``,
              ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``)
        **kwargs: Additional keyword arguments forwarded to the file-parsing
            function (e.g., ``sep='\\t'`` for CSV/TSV files).

    Yields:
        - When using ``glob``: *content* for each matching file
        - When using ``pattern``: ``(content, meta)`` for each matching file,
          where *meta* is a ``dict`` of values extracted from ``{name}``
          placeholders

    Raises:
        DataPortalInputError: if both ``glob`` and ``pattern`` are provided,
            or if neither is provided.

    Example:
    ```python
    # Read all CSV files — just the content
    for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
        print(df.shape)

    # Extract sample names from filenames automatically
    for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
        print(meta['sample'], df.shape)

    # Multi-level capture: condition directory + sample filename
    for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
        print(meta['condition'], meta['sample'], df.shape)

    # Read gzip-compressed TSV files with explicit separator
    for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'):
        print(df.shape)
    ```
    """
    ds = self.get_dataset(project=project, dataset=dataset)
    yield from ds.read_files(glob=glob, pattern=pattern, filetype=filetype, **kwargs)
Read the contents of files from a dataset.
The project and dataset can each be identified by name or ID.
Exactly one of glob or pattern must be provided.
glob — standard wildcard matching; yields the file content for each matching file:
- `*` matches any characters within a single path segment
- `**` matches zero or more path segments
- Matching is suffix-anchored (`*.csv` matches at any depth)
pattern — like glob but {name} placeholders capture portions
of the path automatically; yields (content, meta) pairs where
meta is a dict of extracted values:
- `{name}` captures one path segment (no `/`)
- `*` and `**` wildcards work as in `glob`
Arguments:
- project (str): ID or name of the project.
- dataset (str): ID or name of the dataset.
- glob (str): Wildcard expression to match files
  (e.g., `'*.csv'`, `'data/**/*.tsv.gz'`).
  Yields one item per matching file: the parsed content.
- pattern (str): Wildcard expression with `{name}` capture placeholders
  (e.g., `'{sample}.csv'`, `'{condition}/{sample}.csv'`).
  Yields `(content, meta)` per matching file.
- filetype (str): File format used to parse each file. Supported values:
  - `'csv'`: parse with `pandas.read_csv`, returns a `DataFrame`
  - `'h5ad'`: parse as AnnData (requires the `anndata` package)
  - `'json'`: parse with `json.loads`, returns a Python object
  - `'parquet'`: parse with `pandas.read_parquet`, returns a `DataFrame` (requires `pyarrow` or `fastparquet`)
  - `'feather'`: parse with `pandas.read_feather`, returns a `DataFrame` (requires `pyarrow`)
  - `'pickle'`: deserialize with `pickle`, returns a Python object
  - `'excel'`: parse with `pandas.read_excel`, returns a `DataFrame` (requires `openpyxl` for `.xlsx` or `xlrd` for `.xls`)
  - `'text'`: read as plain text, returns a `str`
  - `'bytes'`: read as raw bytes, returns `bytes`
  - `None` (default): infer from file extension (`.csv`/`.tsv` → `'csv'`, `.h5ad` → `'h5ad'`, `.json` → `'json'`, `.parquet` → `'parquet'`, `.feather` → `'feather'`, `.pkl`/`.pickle` → `'pickle'`, `.xlsx`/`.xls` → `'excel'`, otherwise `'text'`)
- **kwargs: Additional keyword arguments forwarded to the file-parsing
  function (e.g., `sep='\t'` for CSV/TSV files).
Yields:
- When using `glob`: *content* for each matching file
- When using `pattern`: `(content, meta)` for each matching file, where *meta* is a `dict` of values extracted from `{name}` placeholders
Raises:
- DataPortalInputError: if both `glob` and `pattern` are provided, or if neither is provided.
Example:
```python
# Read all CSV files — just the content
for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
    print(df.shape)

# Extract sample names from filenames automatically
for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
    print(meta['sample'], df.shape)

# Multi-level capture: condition directory + sample filename
for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
    print(meta['condition'], meta['sample'], df.shape)

# Read gzip-compressed TSV files with explicit separator
for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\t'):
    print(df.shape)
```
def read_file(
    self,
    project: str,
    dataset: str,
    path: str = None,
    glob: str = None,
    filetype: str = None,
    **kwargs
):
    """
    Read the contents of a single file from a dataset.

    The project and dataset can each be identified by name or ID.
    Provide either ``path`` (exact relative path) or ``glob`` (wildcard
    expression). If ``glob`` is used it must match exactly one file.

    Args:
        project (str): ID or name of the project.
        dataset (str): ID or name of the dataset.
        path (str): Exact relative path of the file within the dataset.
        glob (str): Wildcard expression matching exactly one file.
        filetype (str): File format used to parse the file. Supported values
            are the same as :meth:`read_files`.
        **kwargs: Additional keyword arguments forwarded to the
            file-parsing function.

    Returns:
        Parsed file content.

    Raises:
        DataPortalInputError: if both or neither of ``path``/``glob`` are
            provided, or if ``glob`` matches zero or more than one file.
    """
    ds = self.get_dataset(project=project, dataset=dataset)
    return ds.read_file(path=path, glob=glob, filetype=filetype, **kwargs)
Read the contents of a single file from a dataset.
The project and dataset can each be identified by name or ID.
Provide either path (exact relative path) or glob (wildcard
expression). If glob is used it must match exactly one file.
Arguments:
- project (str): ID or name of the project.
- dataset (str): ID or name of the dataset.
- path (str): Exact relative path of the file within the dataset.
- glob (str): Wildcard expression matching exactly one file.
- filetype (str): File format used to parse the file. Supported values
  are the same as `read_files`.
- **kwargs: Additional keyword arguments forwarded to the file-parsing function.
Returns:
Parsed file content.
Raises:
- DataPortalInputError: if both or neither of `path`/`glob` are provided, or if `glob` matches zero or more than one file.
def list_processes(self, ingest=False) -> DataPortalProcesses:
    """
    List the processes available in the Data Portal.

    With the default ``ingest=False`` every process is returned; with
    ``ingest=True`` only the processes which can be used to ingest
    (upload) datasets directly are returned.

    Args:
        ingest (bool): If True, only list those processes which can be used to ingest datasets directly
    """

    # NOTE(review): when ingest is False the filter passes everything, so
    # all processes (including ingest ones) are listed. Kept as-is because
    # callers may rely on this behavior.
    return DataPortalProcesses(
        [
            DataPortalProcess(p, self._client)
            for p in self._client.processes.list()
            if not ingest or p.executor == Executor.INGEST
        ]
    )
List the processes available in the Data Portal.
With the default `ingest = False` every process is returned; use `ingest = True` to list only the processes which can be used to upload (ingest) datasets directly.
Arguments:
- ingest (bool): If True, only list those processes which can be used to ingest datasets directly
def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess:
    """
    Return the process with the specified name.

    Args:
        name (str): Name of process
    """

    return self.list_processes(ingest=ingest).get_by_name(name)
Return the process with the specified name.
Arguments:
- name (str): Name of process
def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess:
    """
    Return the process with the specified id.

    Args:
        id (str): ID of process
    """

    return self.list_processes(ingest=ingest).get_by_id(id)
Return the process with the specified id
Arguments:
- id (str): ID of process
def list_reference_types(self) -> DataPortalReferenceTypes:
    """
    Return the list of all available reference types.
    """

    return DataPortalReferenceTypes(
        [
            DataPortalReferenceType(ref)
            for ref in self._client.references.get_types()
        ]
    )
Return the list of all available reference types