cirro.sdk.portal

  1from cirro_api_client.v1.models import Executor
  2
  3from cirro.cirro_client import CirroApi
  4from cirro.sdk.dataset import DataPortalDataset
  5from cirro.sdk.developer import DeveloperHelper
  6from cirro.sdk.exceptions import DataPortalAssetNotFound
  7from cirro.sdk.process import DataPortalProcess, DataPortalProcesses
  8from cirro.sdk.project import DataPortalProject, DataPortalProjects
  9from cirro.sdk.reference_type import DataPortalReferenceType, DataPortalReferenceTypes
 10
 11
 12class DataPortal:
 13    """
 14    Helper functions for exploring the Projects, Datasets, Samples, and Files
 15    available in the Data Portal.
 16    """
 17
 18    def __init__(self, base_url: str = None, client: CirroApi = None):
 19        """
 20        Set up the DataPortal object, establishing an authenticated connection.
 21
 22        Args:
 23            base_url (str): Optional base URL of the Cirro instance
 24             (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
 25            client (`cirro.cirro_client.CirroApi`): Optional pre-configured client
 26
 27        Example:
 28        ```python
 29        from cirro import DataPortal
 30
 31        portal = DataPortal(base_url="app.cirro.bio")
 32        portal.list_projects()
 33        ```
 34        """
 35
 36        if client is not None:
 37            self._client = client
 38
 39        # Set up default client if not provided
 40        else:
 41            self._client = CirroApi(base_url=base_url)
 42
 43    def list_projects(self) -> DataPortalProjects:
 44        """List all the projects available in the Data Portal."""
 45
 46        return DataPortalProjects(
 47            [
 48                DataPortalProject(proj, self._client)
 49                for proj in self._client.projects.list()
 50            ]
 51        )
 52
 53    def get_project_by_name(self, name: str = None) -> DataPortalProject:
 54        """Return the project with the specified name."""
 55
 56        return self.list_projects().get_by_name(name)
 57
 58    def get_project_by_id(self, _id: str = None) -> DataPortalProject:
 59        """Return the project with the specified id."""
 60
 61        return self.list_projects().get_by_id(_id)
 62
 63    def get_project(self, project: str = None) -> DataPortalProject:
 64        """
 65        Return a project identified by ID or name.
 66
 67        Args:
 68            project (str): ID or name of project
 69
 70        Returns:
 71            `from cirro.sdk.project import DataPortalProject`
 72        """
 73        try:
 74            return self.get_project_by_id(project)
 75        except DataPortalAssetNotFound:
 76            return self.get_project_by_name(project)
 77
 78    def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset:
 79        """
 80        Return a dataset identified by ID or name.
 81
 82        Args:
 83            project (str): ID or name of project
 84            dataset (str): ID or name of dataset
 85
 86        Returns:
 87            `cirro.sdk.dataset.DataPortalDataset`
 88
 89            ```python
 90            from cirro import DataPortal
 91            portal = DataPortal()
 92            dataset = portal.get_dataset(
 93                project="id-or-name-of-project",
 94                dataset="id-or-name-of-dataset"
 95            )
 96            ```
 97        """
 98        try:
 99            project: DataPortalProject = self.get_project_by_id(project)
100        except DataPortalAssetNotFound:
101            project: DataPortalProject = self.get_project_by_name(project)
102
103        return project.get_dataset(dataset)
104
105    def read_files(
106            self,
107            project: str,
108            dataset: str,
109            glob: str = None,
110            pattern: str = None,
111            filetype: str = None,
112            **kwargs
113    ):
114        """
115        Read the contents of files from a dataset.
116
117        The project and dataset can each be identified by name or ID.
118        Exactly one of ``glob`` or ``pattern`` must be provided.
119
120        **glob** — standard wildcard matching; yields the file content for each
121        matching file:
122
123        - ``*`` matches any characters within a single path segment
124        - ``**`` matches zero or more path segments
125        - Matching is suffix-anchored (``*.csv`` matches at any depth)
126
127        **pattern** — like ``glob`` but ``{name}`` placeholders capture portions
128        of the path automatically; yields ``(content, meta)`` pairs where
129        *meta* is a ``dict`` of extracted values:
130
131        - ``{name}`` captures one path segment (no ``/``)
132        - ``*`` and ``**`` wildcards work as in ``glob``
133
134        Args:
135            project (str): ID or name of the project.
136            dataset (str): ID or name of the dataset.
137            glob (str): Wildcard expression to match files
138                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
139                Yields one item per matching file: the parsed content.
140            pattern (str): Wildcard expression with ``{name}`` capture
141                placeholders (e.g., ``'{sample}.csv'``,
142                ``'{condition}/{sample}.csv'``).
143                Yields ``(content, meta)`` per matching file.
144            filetype (str): File format used to parse each file. Supported values:
145
146                - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
147                - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
148                - ``'json'``: parse with :func:`json.loads`, returns a Python object
149                - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame``
150                  (requires ``pyarrow`` or ``fastparquet``)
151                - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame``
152                  (requires ``pyarrow``)
153                - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object
154                - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame``
155                  (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``)
156                - ``'text'``: read as plain text, returns a ``str``
157                - ``'bytes'``: read as raw bytes, returns ``bytes``
158                - ``None`` (default): infer from file extension
159                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``,
160                  ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
161                  ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``,
162                  ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``)
163            **kwargs: Additional keyword arguments forwarded to the file-parsing
164                function (e.g., ``sep='\\t'`` for CSV/TSV files).
165
166        Yields:
167            - When using ``glob``: *content* for each matching file
168            - When using ``pattern``: ``(content, meta)`` for each matching file,
169              where *meta* is a ``dict`` of values extracted from ``{name}``
170              placeholders
171
172        Raises:
173            DataPortalInputError: if both ``glob`` and ``pattern`` are provided,
174                or if neither is provided.
175
176        Example:
177            ```python
178            # Read all CSV files — just the content
179            for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
180                print(df.shape)
181
182            # Extract sample names from filenames automatically
183            for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
184                print(meta['sample'], df.shape)
185
186            # Multi-level capture: condition directory + sample filename
187            for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
188                print(meta['condition'], meta['sample'], df.shape)
189
190            # Read gzip-compressed TSV files with explicit separator
191            for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'):
192                print(df.shape)
193            ```
194        """
195        ds = self.get_dataset(project=project, dataset=dataset)
196        yield from ds.read_files(glob=glob, pattern=pattern, filetype=filetype, **kwargs)
197
198    def read_file(
199            self,
200            project: str,
201            dataset: str,
202            path: str = None,
203            glob: str = None,
204            filetype: str = None,
205            **kwargs
206    ):
207        """
208        Read the contents of a single file from a dataset.
209
210        The project and dataset can each be identified by name or ID.
211        Provide either ``path`` (exact relative path) or ``glob`` (wildcard
212        expression). If ``glob`` is used it must match exactly one file.
213
214        Args:
215            project (str): ID or name of the project.
216            dataset (str): ID or name of the dataset.
217            path (str): Exact relative path of the file within the dataset.
218            glob (str): Wildcard expression matching exactly one file.
219            filetype (str): File format used to parse the file. Supported values
220                are the same as :meth:`read_files`.
221            **kwargs: Additional keyword arguments forwarded to the
222                file-parsing function.
223
224        Returns:
225            Parsed file content.
226
227        Raises:
228            DataPortalInputError: if both or neither of ``path``/``glob`` are
229                provided, or if ``glob`` matches zero or more than one file.
230        """
231        ds = self.get_dataset(project=project, dataset=dataset)
232        return ds.read_file(path=path, glob=glob, filetype=filetype, **kwargs)
233
234    def list_processes(self, ingest=False) -> DataPortalProcesses:
235        """
236        List all the processes available in the Data Portal.
 237        By default, all available processes are listed.
238        To list the processes which can be used to upload datasets, use `ingest = True`.
239
240        Args:
241            ingest (bool): If True, only list those processes which can be used to ingest datasets directly
242        """
243
244        return DataPortalProcesses(
245            [
246                DataPortalProcess(p, self._client)
247                for p in self._client.processes.list()
248                if not ingest or p.executor == Executor.INGEST
249            ]
250        )
251
252    def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess:
253        """
254        Return the process with the specified name.
255
256        Args:
257            name (str): Name of process
258        """
259
260        return self.list_processes(ingest=ingest).get_by_name(name)
261
262    def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess:
263        """
264        Return the process with the specified id
265
266        Args:
267            id (str): ID of process
268        """
269
270        return self.list_processes(ingest=ingest).get_by_id(id)
271
272    def list_reference_types(self) -> DataPortalReferenceTypes:
273        """
274        Return the list of all available reference types
275        """
276
277        return DataPortalReferenceTypes(
278            [
279                DataPortalReferenceType(ref)
280                for ref in self._client.references.get_types()
281            ]
282        )
283
284    @property
285    def developer_helper(self) -> DeveloperHelper:
286        return DeveloperHelper(self._client)
class DataPortal:
 13class DataPortal:
 14    """
 15    Helper functions for exploring the Projects, Datasets, Samples, and Files
 16    available in the Data Portal.
 17    """
 18
 19    def __init__(self, base_url: str = None, client: CirroApi = None):
 20        """
 21        Set up the DataPortal object, establishing an authenticated connection.
 22
 23        Args:
 24            base_url (str): Optional base URL of the Cirro instance
 25             (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
 26            client (`cirro.cirro_client.CirroApi`): Optional pre-configured client
 27
 28        Example:
 29        ```python
 30        from cirro import DataPortal
 31
 32        portal = DataPortal(base_url="app.cirro.bio")
 33        portal.list_projects()
 34        ```
 35        """
 36
 37        if client is not None:
 38            self._client = client
 39
 40        # Set up default client if not provided
 41        else:
 42            self._client = CirroApi(base_url=base_url)
 43
 44    def list_projects(self) -> DataPortalProjects:
 45        """List all the projects available in the Data Portal."""
 46
 47        return DataPortalProjects(
 48            [
 49                DataPortalProject(proj, self._client)
 50                for proj in self._client.projects.list()
 51            ]
 52        )
 53
 54    def get_project_by_name(self, name: str = None) -> DataPortalProject:
 55        """Return the project with the specified name."""
 56
 57        return self.list_projects().get_by_name(name)
 58
 59    def get_project_by_id(self, _id: str = None) -> DataPortalProject:
 60        """Return the project with the specified id."""
 61
 62        return self.list_projects().get_by_id(_id)
 63
 64    def get_project(self, project: str = None) -> DataPortalProject:
 65        """
 66        Return a project identified by ID or name.
 67
 68        Args:
 69            project (str): ID or name of project
 70
 71        Returns:
 72            `cirro.sdk.project.DataPortalProject`
 73        """
 74        try:
 75            return self.get_project_by_id(project)
 76        except DataPortalAssetNotFound:
 77            return self.get_project_by_name(project)
 78
 79    def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset:
 80        """
 81        Return a dataset identified by ID or name.
 82
 83        Args:
 84            project (str): ID or name of project
 85            dataset (str): ID or name of dataset
 86
 87        Returns:
 88            `cirro.sdk.dataset.DataPortalDataset`
 89
 90            ```python
 91            from cirro import DataPortal
 92            portal = DataPortal()
 93            dataset = portal.get_dataset(
 94                project="id-or-name-of-project",
 95                dataset="id-or-name-of-dataset"
 96            )
 97            ```
 98        """
 99        try:
100            project: DataPortalProject = self.get_project_by_id(project)
101        except DataPortalAssetNotFound:
102            project: DataPortalProject = self.get_project_by_name(project)
103
104        return project.get_dataset(dataset)
105
106    def read_files(
107            self,
108            project: str,
109            dataset: str,
110            glob: str = None,
111            pattern: str = None,
112            filetype: str = None,
113            **kwargs
114    ):
115        """
116        Read the contents of files from a dataset.
117
118        The project and dataset can each be identified by name or ID.
119        Exactly one of ``glob`` or ``pattern`` must be provided.
120
121        **glob** — standard wildcard matching; yields the file content for each
122        matching file:
123
124        - ``*`` matches any characters within a single path segment
125        - ``**`` matches zero or more path segments
126        - Matching is suffix-anchored (``*.csv`` matches at any depth)
127
128        **pattern** — like ``glob`` but ``{name}`` placeholders capture portions
129        of the path automatically; yields ``(content, meta)`` pairs where
130        *meta* is a ``dict`` of extracted values:
131
132        - ``{name}`` captures one path segment (no ``/``)
133        - ``*`` and ``**`` wildcards work as in ``glob``
134
135        Args:
136            project (str): ID or name of the project.
137            dataset (str): ID or name of the dataset.
138            glob (str): Wildcard expression to match files
139                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
140                Yields one item per matching file: the parsed content.
141            pattern (str): Wildcard expression with ``{name}`` capture
142                placeholders (e.g., ``'{sample}.csv'``,
143                ``'{condition}/{sample}.csv'``).
144                Yields ``(content, meta)`` per matching file.
145            filetype (str): File format used to parse each file. Supported values:
146
147                - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
148                - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
149                - ``'json'``: parse with :func:`json.loads`, returns a Python object
150                - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame``
151                  (requires ``pyarrow`` or ``fastparquet``)
152                - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame``
153                  (requires ``pyarrow``)
154                - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object
155                - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame``
156                  (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``)
157                - ``'text'``: read as plain text, returns a ``str``
158                - ``'bytes'``: read as raw bytes, returns ``bytes``
159                - ``None`` (default): infer from file extension
160                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``,
161                  ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
162                  ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``,
163                  ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``)
164            **kwargs: Additional keyword arguments forwarded to the file-parsing
165                function (e.g., ``sep='\\t'`` for CSV/TSV files).
166
167        Yields:
168            - When using ``glob``: *content* for each matching file
169            - When using ``pattern``: ``(content, meta)`` for each matching file,
170              where *meta* is a ``dict`` of values extracted from ``{name}``
171              placeholders
172
173        Raises:
174            DataPortalInputError: if both ``glob`` and ``pattern`` are provided,
175                or if neither is provided.
176
177        Example:
178            ```python
179            # Read all CSV files — just the content
180            for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
181                print(df.shape)
182
183            # Extract sample names from filenames automatically
184            for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
185                print(meta['sample'], df.shape)
186
187            # Multi-level capture: condition directory + sample filename
188            for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
189                print(meta['condition'], meta['sample'], df.shape)
190
191            # Read gzip-compressed TSV files with explicit separator
192            for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'):
193                print(df.shape)
194            ```
195        """
196        ds = self.get_dataset(project=project, dataset=dataset)
197        yield from ds.read_files(glob=glob, pattern=pattern, filetype=filetype, **kwargs)
198
199    def read_file(
200            self,
201            project: str,
202            dataset: str,
203            path: str = None,
204            glob: str = None,
205            filetype: str = None,
206            **kwargs
207    ):
208        """
209        Read the contents of a single file from a dataset.
210
211        The project and dataset can each be identified by name or ID.
212        Provide either ``path`` (exact relative path) or ``glob`` (wildcard
213        expression). If ``glob`` is used it must match exactly one file.
214
215        Args:
216            project (str): ID or name of the project.
217            dataset (str): ID or name of the dataset.
218            path (str): Exact relative path of the file within the dataset.
219            glob (str): Wildcard expression matching exactly one file.
220            filetype (str): File format used to parse the file. Supported values
221                are the same as :meth:`read_files`.
222            **kwargs: Additional keyword arguments forwarded to the
223                file-parsing function.
224
225        Returns:
226            Parsed file content.
227
228        Raises:
229            DataPortalInputError: if both or neither of ``path``/``glob`` are
230                provided, or if ``glob`` matches zero or more than one file.
231        """
232        ds = self.get_dataset(project=project, dataset=dataset)
233        return ds.read_file(path=path, glob=glob, filetype=filetype, **kwargs)
234
235    def list_processes(self, ingest=False) -> DataPortalProcesses:
236        """
237        List all the processes available in the Data Portal.
 238        By default, all available processes are listed.
239        To list the processes which can be used to upload datasets, use `ingest = True`.
240
241        Args:
242            ingest (bool): If True, only list those processes which can be used to ingest datasets directly
243        """
244
245        return DataPortalProcesses(
246            [
247                DataPortalProcess(p, self._client)
248                for p in self._client.processes.list()
249                if not ingest or p.executor == Executor.INGEST
250            ]
251        )
252
253    def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess:
254        """
255        Return the process with the specified name.
256
257        Args:
258            name (str): Name of process
259        """
260
261        return self.list_processes(ingest=ingest).get_by_name(name)
262
263    def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess:
264        """
265        Return the process with the specified id
266
267        Args:
268            id (str): ID of process
269        """
270
271        return self.list_processes(ingest=ingest).get_by_id(id)
272
273    def list_reference_types(self) -> DataPortalReferenceTypes:
274        """
275        Return the list of all available reference types
276        """
277
278        return DataPortalReferenceTypes(
279            [
280                DataPortalReferenceType(ref)
281                for ref in self._client.references.get_types()
282            ]
283        )
284
285    @property
286    def developer_helper(self) -> DeveloperHelper:
287        return DeveloperHelper(self._client)

Helper functions for exploring the Projects, Datasets, Samples, and Files available in the Data Portal.

DataPortal(base_url: str = None, client: cirro.CirroApi = None)
19    def __init__(self, base_url: str = None, client: CirroApi = None):
20        """
21        Set up the DataPortal object, establishing an authenticated connection.
22
23        Args:
24            base_url (str): Optional base URL of the Cirro instance
25             (if not provided, it uses the `CIRRO_BASE_URL` environment variable, or the config file)
26            client (`cirro.cirro_client.CirroApi`): Optional pre-configured client
27
28        Example:
29        ```python
30        from cirro import DataPortal
31
32        portal = DataPortal(base_url="app.cirro.bio")
33        portal.list_projects()
34        ```
35        """
36
37        if client is not None:
38            self._client = client
39
40        # Set up default client if not provided
41        else:
42            self._client = CirroApi(base_url=base_url)

Set up the DataPortal object, establishing an authenticated connection.

Arguments:
  • base_url (str): Optional base URL of the Cirro instance (if not provided, it uses the CIRRO_BASE_URL environment variable, or the config file)
  • client (cirro.cirro_client.CirroApi): Optional pre-configured client

Example:

from cirro import DataPortal

portal = DataPortal(base_url="app.cirro.bio")
portal.list_projects()
def list_projects(self) -> cirro.sdk.project.DataPortalProjects:
44    def list_projects(self) -> DataPortalProjects:
45        """List all the projects available in the Data Portal."""
46
47        return DataPortalProjects(
48            [
49                DataPortalProject(proj, self._client)
50                for proj in self._client.projects.list()
51            ]
52        )

List all the projects available in the Data Portal.

def get_project_by_name(self, name: str = None) -> cirro.DataPortalProject:
54    def get_project_by_name(self, name: str = None) -> DataPortalProject:
55        """Return the project with the specified name."""
56
57        return self.list_projects().get_by_name(name)

Return the project with the specified name.

def get_project_by_id(self, _id: str = None) -> cirro.DataPortalProject:
59    def get_project_by_id(self, _id: str = None) -> DataPortalProject:
60        """Return the project with the specified id."""
61
62        return self.list_projects().get_by_id(_id)

Return the project with the specified id.

def get_project(self, project: str = None) -> cirro.DataPortalProject:
64    def get_project(self, project: str = None) -> DataPortalProject:
65        """
66        Return a project identified by ID or name.
67
68        Args:
69            project (str): ID or name of project
70
71        Returns:
 72            `cirro.sdk.project.DataPortalProject`
73        """
74        try:
75            return self.get_project_by_id(project)
76        except DataPortalAssetNotFound:
77            return self.get_project_by_name(project)

Return a project identified by ID or name.

Arguments:
  • project (str): ID or name of project
Returns:

cirro.sdk.project.DataPortalProject

def get_dataset( self, project: str = None, dataset: str = None) -> cirro.DataPortalDataset:
 79    def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDataset:
 80        """
 81        Return a dataset identified by ID or name.
 82
 83        Args:
 84            project (str): ID or name of project
 85            dataset (str): ID or name of dataset
 86
 87        Returns:
 88            `cirro.sdk.dataset.DataPortalDataset`
 89
 90            ```python
 91            from cirro import DataPortal
 92            portal = DataPortal()
 93            dataset = portal.get_dataset(
 94                project="id-or-name-of-project",
 95                dataset="id-or-name-of-dataset"
 96            )
 97            ```
 98        """
 99        try:
100            project: DataPortalProject = self.get_project_by_id(project)
101        except DataPortalAssetNotFound:
102            project: DataPortalProject = self.get_project_by_name(project)
103
104        return project.get_dataset(dataset)

Return a dataset identified by ID or name.

Arguments:
  • project (str): ID or name of project
  • dataset (str): ID or name of dataset
Returns:

cirro.sdk.dataset.DataPortalDataset

from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
    project="id-or-name-of-project",
    dataset="id-or-name-of-dataset"
)
def read_files( self, project: str, dataset: str, glob: str = None, pattern: str = None, filetype: str = None, **kwargs):
106    def read_files(
107            self,
108            project: str,
109            dataset: str,
110            glob: str = None,
111            pattern: str = None,
112            filetype: str = None,
113            **kwargs
114    ):
115        """
116        Read the contents of files from a dataset.
117
118        The project and dataset can each be identified by name or ID.
119        Exactly one of ``glob`` or ``pattern`` must be provided.
120
121        **glob** — standard wildcard matching; yields the file content for each
122        matching file:
123
124        - ``*`` matches any characters within a single path segment
125        - ``**`` matches zero or more path segments
126        - Matching is suffix-anchored (``*.csv`` matches at any depth)
127
128        **pattern** — like ``glob`` but ``{name}`` placeholders capture portions
129        of the path automatically; yields ``(content, meta)`` pairs where
130        *meta* is a ``dict`` of extracted values:
131
132        - ``{name}`` captures one path segment (no ``/``)
133        - ``*`` and ``**`` wildcards work as in ``glob``
134
135        Args:
136            project (str): ID or name of the project.
137            dataset (str): ID or name of the dataset.
138            glob (str): Wildcard expression to match files
139                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
140                Yields one item per matching file: the parsed content.
141            pattern (str): Wildcard expression with ``{name}`` capture
142                placeholders (e.g., ``'{sample}.csv'``,
143                ``'{condition}/{sample}.csv'``).
144                Yields ``(content, meta)`` per matching file.
145            filetype (str): File format used to parse each file. Supported values:
146
147                - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
148                - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
149                - ``'json'``: parse with :func:`json.loads`, returns a Python object
150                - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame``
151                  (requires ``pyarrow`` or ``fastparquet``)
152                - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame``
153                  (requires ``pyarrow``)
154                - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object
155                - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame``
156                  (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``)
157                - ``'text'``: read as plain text, returns a ``str``
158                - ``'bytes'``: read as raw bytes, returns ``bytes``
159                - ``None`` (default): infer from file extension
160                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``,
161                  ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
162                  ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``,
163                  ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``)
164            **kwargs: Additional keyword arguments forwarded to the file-parsing
165                function (e.g., ``sep='\\t'`` for CSV/TSV files).
166
167        Yields:
168            - When using ``glob``: *content* for each matching file
169            - When using ``pattern``: ``(content, meta)`` for each matching file,
170              where *meta* is a ``dict`` of values extracted from ``{name}``
171              placeholders
172
173        Raises:
174            DataPortalInputError: if both ``glob`` and ``pattern`` are provided,
175                or if neither is provided.
176
177        Example:
178            ```python
179            # Read all CSV files — just the content
180            for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
181                print(df.shape)
182
183            # Extract sample names from filenames automatically
184            for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
185                print(meta['sample'], df.shape)
186
187            # Multi-level capture: condition directory + sample filename
188            for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
189                print(meta['condition'], meta['sample'], df.shape)
190
191            # Read gzip-compressed TSV files with explicit separator
192            for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'):
193                print(df.shape)
194            ```
195        """
196        ds = self.get_dataset(project=project, dataset=dataset)
197        yield from ds.read_files(glob=glob, pattern=pattern, filetype=filetype, **kwargs)

Read the contents of files from a dataset.

The project and dataset can each be identified by name or ID. Exactly one of glob or pattern must be provided.

glob — standard wildcard matching; yields the file content for each matching file:

  • * matches any characters within a single path segment
  • ** matches zero or more path segments
  • Matching is suffix-anchored (*.csv matches at any depth)

pattern — like glob but {name} placeholders capture portions of the path automatically; yields (content, meta) pairs where meta is a dict of extracted values:

  • {name} captures one path segment (no /)
  • * and ** wildcards work as in glob
Arguments:
  • project (str): ID or name of the project.
  • dataset (str): ID or name of the dataset.
  • glob (str): Wildcard expression to match files (e.g., '*.csv', 'data/**/*.tsv.gz'). Yields one item per matching file: the parsed content.
  • pattern (str): Wildcard expression with {name} capture placeholders (e.g., '{sample}.csv', '{condition}/{sample}.csv'). Yields (content, meta) per matching file.
  • filetype (str): File format used to parse each file. Supported values:

    • 'csv': parse with pandas.read_csv(), returns a DataFrame
    • 'h5ad': parse as AnnData (requires anndata package)
    • 'json': parse with json.loads(), returns a Python object
    • 'parquet': parse with pandas.read_parquet(), returns a DataFrame (requires pyarrow or fastparquet)
    • 'feather': parse with pandas.read_feather(), returns a DataFrame (requires pyarrow)
    • 'pickle': deserialize with pickle, returns a Python object
    • 'excel': parse with pandas.read_excel(), returns a DataFrame (requires openpyxl for .xlsx or xlrd for .xls)
    • 'text': read as plain text, returns a str
    • 'bytes': read as raw bytes, returns bytes
    • None (default): infer from file extension (.csv/.tsv → 'csv', .h5ad → 'h5ad', .json → 'json', .parquet → 'parquet', .feather → 'feather', .pkl/.pickle → 'pickle', .xlsx/.xls → 'excel', otherwise 'text')
  • **kwargs: Additional keyword arguments forwarded to the file-parsing function (e.g., sep='\t' for CSV/TSV files).
Yields:
  • When using glob: content for each matching file
  • When using pattern: (content, meta) for each matching file, where meta is a dict of values extracted from {name} placeholders
Raises:
  • DataPortalInputError: if both glob and pattern are provided, or if neither is provided.
Example:
# Read all CSV files — just the content
for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
    print(df.shape)

# Extract sample names from filenames automatically
for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
    print(meta['sample'], df.shape)

# Multi-level capture: condition directory + sample filename
for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
    print(meta['condition'], meta['sample'], df.shape)

# Read gzip-compressed TSV files with explicit separator
for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\t'):
    print(df.shape)
def read_file( self, project: str, dataset: str, path: str = None, glob: str = None, filetype: str = None, **kwargs):
199    def read_file(
200            self,
201            project: str,
202            dataset: str,
203            path: str = None,
204            glob: str = None,
205            filetype: str = None,
206            **kwargs
207    ):
208        """
209        Read the contents of a single file from a dataset.
210
211        The project and dataset can each be identified by name or ID.
212        Provide either ``path`` (exact relative path) or ``glob`` (wildcard
213        expression). If ``glob`` is used it must match exactly one file.
214
215        Args:
216            project (str): ID or name of the project.
217            dataset (str): ID or name of the dataset.
218            path (str): Exact relative path of the file within the dataset.
219            glob (str): Wildcard expression matching exactly one file.
220            filetype (str): File format used to parse the file. Supported values
221                are the same as :meth:`read_files`.
222            **kwargs: Additional keyword arguments forwarded to the
223                file-parsing function.
224
225        Returns:
226            Parsed file content.
227
228        Raises:
229            DataPortalInputError: if both or neither of ``path``/``glob`` are
230                provided, or if ``glob`` matches zero or more than one file.
231        """
232        ds = self.get_dataset(project=project, dataset=dataset)
233        return ds.read_file(path=path, glob=glob, filetype=filetype, **kwargs)

Read the contents of a single file from a dataset.

The project and dataset can each be identified by name or ID. Provide either path (exact relative path) or glob (wildcard expression). If glob is used it must match exactly one file.

Arguments:
  • project (str): ID or name of the project.
  • dataset (str): ID or name of the dataset.
  • path (str): Exact relative path of the file within the dataset.
  • glob (str): Wildcard expression matching exactly one file.
  • filetype (str): File format used to parse the file. Supported values are the same as read_files().
  • **kwargs: Additional keyword arguments forwarded to the file-parsing function.
Returns:

Parsed file content.

Raises:
  • DataPortalInputError: if both or neither of path/glob are provided, or if glob matches zero or more than one file.
def list_processes(self, ingest=False) -> cirro.sdk.process.DataPortalProcesses:
235    def list_processes(self, ingest=False) -> DataPortalProcesses:
236        """
237        List all the processes available in the Data Portal.
238        By default, only list non-ingest processes (those which can be run on existing datasets).
239        To list the processes which can be used to upload datasets, use `ingest = True`.
240
241        Args:
242            ingest (bool): If True, only list those processes which can be used to ingest datasets directly
243        """
244
245        return DataPortalProcesses(
246            [
247                DataPortalProcess(p, self._client)
248                for p in self._client.processes.list()
249                if not ingest or p.executor == Executor.INGEST
250            ]
251        )

List all the processes available in the Data Portal. By default, only list non-ingest processes (those which can be run on existing datasets). To list the processes which can be used to upload datasets, use ingest = True.

Arguments:
  • ingest (bool): If True, only list those processes which can be used to ingest datasets directly
def get_process_by_name(self, name: str, ingest=False) -> cirro.DataPortalProcess:
253    def get_process_by_name(self, name: str, ingest=False) -> DataPortalProcess:
254        """
255        Return the process with the specified name.
256
257        Args:
258            name (str): Name of process
259        """
260
261        return self.list_processes(ingest=ingest).get_by_name(name)

Return the process with the specified name.

Arguments:
  • name (str): Name of process
def get_process_by_id(self, id: str, ingest=False) -> cirro.DataPortalProcess:
263    def get_process_by_id(self, id: str, ingest=False) -> DataPortalProcess:
264        """
265        Return the process with the specified id
266
267        Args:
268            id (str): ID of process
269        """
270
271        return self.list_processes(ingest=ingest).get_by_id(id)

Return the process with the specified id

Arguments:
  • id (str): ID of process
def list_reference_types(self) -> cirro.sdk.reference_type.DataPortalReferenceTypes:
273    def list_reference_types(self) -> DataPortalReferenceTypes:
274        """
275        Return the list of all available reference types
276        """
277
278        return DataPortalReferenceTypes(
279            [
280                DataPortalReferenceType(ref)
281                for ref in self._client.references.get_types()
282            ]
283        )

Return the list of all available reference types

developer_helper: cirro.sdk.developer.DeveloperHelper
285    @property
286    def developer_helper(self) -> DeveloperHelper:
287        return DeveloperHelper(self._client)