cirro.sdk.dataset

  1import datetime
  2import re
  3from pathlib import Path
  4from typing import Union, List, Optional, Any
  5
  6from cirro_api_client.v1.api.processes import validate_file_requirements
  7from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \
  8    RunAnalysisRequestParams, Tag, ArtifactType, NamedItem, ValidateFileRequirementsRequest
  9
 10from cirro.cirro_client import CirroApi
 11from cirro.file_utils import filter_files_by_pattern
 12from cirro.models.assets import DatasetAssets
 13from cirro.models.file import PathLike
 14from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
 15from cirro.sdk.exceptions import DataPortalAssetNotFound
 16from cirro.sdk.exceptions import DataPortalInputError
 17from cirro.sdk.file import DataPortalFile, DataPortalFiles
 18from cirro.sdk.helpers import parse_process_name_or_id
 19from cirro.sdk.process import DataPortalProcess
 20
 21
 22def _pattern_to_captures_regex(pattern: str):
 23    """
 24    Convert a glob pattern that may contain ``{name}`` capture placeholders into
 25    a compiled regex and return ``(compiled_regex, capture_names)``.
 26
 27    Conversion rules:
 28      - ``{name}``  → named group matching a single path segment (no ``/``)
 29      - ``*``       → matches any characters within a single path segment
 30      - ``**``      → matches any characters including ``/`` (multiple segments)
 31      - All other characters are regex-escaped.
 32
 33    The resulting regex is suffix-anchored (like ``pathlib.PurePath.match``):
 34    a pattern without a leading ``/`` will match at any depth in the path.
 35    """
 36    capture_names = re.findall(r'\{(\w+)\}', pattern)
 37    tokens = re.split(r'(\*\*|\*|\{\w+\})', pattern)
 38    parts = []
 39    for token in tokens:
 40        if token == '**':
 41            parts.append('.*')
 42        elif token == '*':
 43            parts.append('[^/]*')
 44        elif re.match(r'^\{\w+\}$', token):
 45            name = token[1:-1]
 46            parts.append(f'(?P<{name}>[^/]+)')
 47        else:
 48            parts.append(re.escape(token))
 49    regex_str = ''.join(parts)
 50    if not pattern.startswith('/'):
 51        regex_str = r'(?:.+/)?' + regex_str
 52    return re.compile('^' + regex_str + '$'), capture_names
 53
 54
 55def _infer_file_format(path: str) -> str:
 56    """Infer the file format from the file extension."""
 57    path_lower = path.lower()
 58    for ext in ('.gz', '.bz2', '.xz', '.zst'):
 59        if path_lower.endswith(ext):
 60            path_lower = path_lower[:-len(ext)]
 61            break
 62    if path_lower.endswith('.csv') or path_lower.endswith('.tsv'):
 63        return 'csv'
 64    elif path_lower.endswith('.h5ad'):
 65        return 'h5ad'
 66    elif path_lower.endswith('.json'):
 67        return 'json'
 68    elif path_lower.endswith('.parquet'):
 69        return 'parquet'
 70    elif path_lower.endswith('.feather'):
 71        return 'feather'
 72    elif path_lower.endswith('.pkl') or path_lower.endswith('.pickle'):
 73        return 'pickle'
 74    elif path_lower.endswith('.xlsx') or path_lower.endswith('.xls'):
 75        return 'excel'
 76    else:
 77        return 'text'
 78
 79
 80def _read_file_with_format(file: DataPortalFile, file_format: Optional[str], **kwargs) -> Any:
 81    """Read a file using the specified format, or auto-detect from extension."""
 82    if file_format is None:
 83        file_format = _infer_file_format(file.relative_path)
 84    if file_format == 'csv':
 85        return file.read_csv(**kwargs)
 86    elif file_format == 'h5ad':
 87        return file.read_h5ad()
 88    elif file_format == 'json':
 89        return file.read_json(**kwargs)
 90    elif file_format == 'parquet':
 91        return file.read_parquet(**kwargs)
 92    elif file_format == 'feather':
 93        return file.read_feather(**kwargs)
 94    elif file_format == 'pickle':
 95        return file.read_pickle(**kwargs)
 96    elif file_format == 'excel':
 97        return file.read_excel(**kwargs)
 98    elif file_format == 'text':
 99        return file.read(**kwargs)
100    elif file_format == 'bytes':
101        return file._get()
102    else:
103        raise DataPortalInputError(
104            f"Unsupported file_format: '{file_format}'. "
105            f"Supported values: 'csv', 'h5ad', 'json', 'parquet', 'feather', 'pickle', 'excel', 'text', 'bytes'"
106        )
107
108
class DataPortalDataset(DataPortalAsset):
    """
    Datasets in the Data Portal are collections of files which have
    either been uploaded directly, or which have been output by
    an analysis pipeline or notebook.
    """

    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
        """
        Instantiate a dataset object

        Should be invoked from a top-level constructor, for example:

        ```python
        from cirro import DataPortal
        portal = DataPortal()
        dataset = portal.get_dataset(
            project="id-or-name-of-project",
            dataset="id-or-name-of-dataset"
        )
        ```

        """
        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
        # _data may start as a summary Dataset; _get_detail() lazily upgrades
        # it to a DatasetDetail on first access to detail-only attributes.
        self._data = dataset
        # Lazily-populated asset listing cache (see _get_assets)
        self._assets: Optional[DatasetAssets] = None
        self._client = client

    @property
    def id(self) -> str:
        """Unique identifier for the dataset"""
        return self._data.id

    @property
    def name(self) -> str:
        """Editable name for the dataset"""
        return self._data.name

    @property
    def description(self) -> str:
        """Longer name for the dataset"""
        return self._data.description

    @property
    def process_id(self) -> str:
        """Unique ID of process used to create the dataset"""
        return self._data.process_id

    @property
    def process(self) -> ProcessDetail:
        """
        Object representing the process used to create the dataset

        NOTE: fetched from the API on every access (not cached).
        """
        return self._client.processes.get(self.process_id)

    @property
    def project_id(self) -> str:
        """ID of the project containing the dataset"""
        return self._data.project_id

    @property
    def status(self) -> Status:
        """
        Status of the dataset
        """
        return self._data.status

    @property
    def source_dataset_ids(self) -> List[str]:
        """IDs of the datasets used as sources for this dataset (if any)"""
        return self._data.source_dataset_ids

    @property
    def source_datasets(self) -> List['DataPortalDataset']:
        """
        Objects representing the datasets used as sources for this dataset (if any)

        Each source dataset is fetched from the API on access.
        """
        return [
            DataPortalDataset(
                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
                client=self._client
            )
            for dataset_id in self.source_dataset_ids
        ]

    @property
    def params(self) -> dict:
        """
        Parameters used to generate the dataset
        """
        return self._get_detail().params.to_dict()

    @property
    def info(self) -> dict:
        """
        Extra information about the dataset
        """
        return self._get_detail().info.to_dict()

    @property
    def tags(self) -> List[Tag]:
        """
        Tags applied to the dataset
        """
        return self._data.tags

    @property
    def share(self) -> Optional[NamedItem]:
        """
        Share associated with the dataset, if any.
        """
        return self._get_detail().share

    @property
    def created_by(self) -> str:
        """User who created the dataset"""
        return self._data.created_by

    @property
    def created_at(self) -> datetime.datetime:
        """Timestamp of dataset creation"""
        return self._data.created_at

    def _get_detail(self) -> DatasetDetail:
        """
        Return the full DatasetDetail record, fetching it from the API
        (and caching it on self._data) if only a summary is held.
        """
        if not isinstance(self._data, DatasetDetail):
            self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id)
        return self._data

    def _get_assets(self) -> DatasetAssets:
        """
        Return the dataset's asset listing, fetching it from the API on
        first use and caching it on self._assets.
        """
        if not self._assets:
            self._assets = self._client.datasets.get_assets_listing(
                project_id=self.project_id,
                dataset_id=self.id
            )
        return self._assets

    def __str__(self) -> str:
        # One "Title: value" line per summary attribute
        return '\n'.join([
            f"{i.title()}: {self.__getattribute__(i)}"
            for i in ['name', 'id', 'description', 'status']
        ])

    def get_file(self, relative_path: str) -> DataPortalFile:
        """
        Get a file from the dataset using its relative path.

        Args:
            relative_path (str): Relative path of file within the dataset

        Returns:
            `from cirro.sdk.file import DataPortalFile`

        Raises:
            DataPortalAssetNotFound: If no file matches the path (with or
                without the 'data/' prefix).
        """

        # Get the list of files in this dataset
        files = self.list_files()

        # Try getting the file using the relative path provided by the user
        try:
            return files.get_by_id(relative_path)
        except DataPortalAssetNotFound:
            # Try getting the file with the 'data/' prefix prepended
            try:
                return files.get_by_id("data/" + relative_path)
            except DataPortalAssetNotFound:
                # If not found, raise the exception using the string provided
                # by the user, not the data/ prepended version (which may be
                # confusing to the user)
                msg = '\n'.join([f"No file found with path '{relative_path}'."])
                raise DataPortalAssetNotFound(msg)

    def list_files(self, file_limit: int = 100000) -> DataPortalFiles:
        """
        Return the list of files which make up the dataset.

        Args:
            file_limit (int): Maximum number of files to return (default 100,000)
        """
        # NOTE: fetches a fresh listing on every call (does not use the
        # cached self._assets, which is reserved for artifact lookups)
        assets = self._client.datasets.get_assets_listing(
            project_id=self.project_id,
            dataset_id=self.id,
            file_limit=file_limit
        )
        files = assets.files

        return DataPortalFiles(
            [
                DataPortalFile(file=file, client=self._client)
                for file in files
            ]
        )

    def read_files(
            self,
            glob: str = None,
            pattern: str = None,
            filetype: str = None,
            **kwargs
    ):
        """
        Read the contents of files in the dataset.

        See :meth:`~cirro.sdk.portal.DataPortal.read_files` for full details
        on ``glob``/``pattern`` matching and filetype options.

        Args:
            glob (str): Wildcard expression to match files.
                Yields one item per matching file: the parsed content.
            pattern (str): Wildcard expression with ``{name}`` capture
                placeholders. Yields ``(content, meta)`` per matching file.
            filetype (str): File format used to parse each file
                (or ``None`` to infer from extension).
            **kwargs: Additional keyword arguments forwarded to the
                file-parsing function.

        Yields:
            - When using ``glob``: *content* for each matching file
            - When using ``pattern``: ``(content, meta)`` for each matching file

        Raises:
            DataPortalInputError: If both or neither of ``glob``/``pattern``
                are provided.
        """
        # Exactly one of glob / pattern must be supplied
        if glob is not None and pattern is not None:
            raise DataPortalInputError("Cannot specify both 'glob' and 'pattern' — use one or the other")
        if glob is None and pattern is None:
            raise DataPortalInputError("Must specify either 'glob' or 'pattern'")

        if glob is not None:
            for file in filter_files_by_pattern(list(self.list_files()), glob):
                yield _read_file_with_format(file, filetype, **kwargs)
        else:
            # pattern mode: emit (content, captured-placeholder dict) pairs
            compiled_regex, _ = _pattern_to_captures_regex(pattern)
            for file in self.list_files():
                m = compiled_regex.match(file.relative_path)
                if m is not None:
                    yield _read_file_with_format(file, filetype, **kwargs), m.groupdict()

    def read_file(
            self,
            path: str = None,
            glob: str = None,
            filetype: str = None,
            **kwargs
    ) -> Any:
        """
        Read the contents of a single file from the dataset.

        See :meth:`~cirro.sdk.portal.DataPortal.read_file` for full details.

        Args:
            path (str): Exact relative path of the file within the dataset.
            glob (str): Wildcard expression matching exactly one file.
            filetype (str): File format used to parse the file. Supported values
                are the same as :meth:`~cirro.sdk.portal.DataPortal.read_files`.
            **kwargs: Additional keyword arguments forwarded to the file-parsing
                function.

        Returns:
            Parsed file content.

        Raises:
            DataPortalInputError: If both or neither of ``path``/``glob``
                are provided, or if ``glob`` matches more than one file.
            DataPortalAssetNotFound: If no file matches.
        """
        # Exactly one of path / glob must be supplied
        if path is not None and glob is not None:
            raise DataPortalInputError("Cannot specify both 'path' and 'glob' — use one or the other")
        if path is None and glob is None:
            raise DataPortalInputError("Must specify either 'path' or 'glob'")

        if path is not None:
            file = self.get_file(path)
        else:
            matches = list(filter_files_by_pattern(list(self.list_files()), glob))
            if len(matches) == 0:
                raise DataPortalAssetNotFound(f"No files matched glob '{glob}'")
            if len(matches) > 1:
                raise DataPortalInputError(
                    f"glob '{glob}' matched {len(matches)} files — use read_files() to read multiple files"
                )
            file = matches[0]

        return _read_file_with_format(file, filetype, **kwargs)

    def get_trace(self) -> Any:
        """
        Read the Nextflow workflow trace file for this dataset as a DataFrame.

        Returns:
            `pandas.DataFrame`
        """
        # Trace artifact is tab-separated
        return self.get_artifact(ArtifactType.WORKFLOW_TRACE).read_csv(sep='\t')

    def get_logs(self) -> str:
        """
        Read the Nextflow workflow logs for this dataset as a string.

        Returns:
            str
        """
        return self.get_artifact(ArtifactType.WORKFLOW_LOGS).read()

    def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
        """
        Get the artifact of a particular type from the dataset

        Raises:
            DataPortalAssetNotFound: If no artifact of the given type exists.
        """
        artifacts = self._get_assets().artifacts
        artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None)
        if artifact is None:
            raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'")
        return DataPortalFile(file=artifact.file, client=self._client)

    def list_artifacts(self) -> List[DataPortalFile]:
        """
        Return the list of artifacts associated with the dataset

        An artifact may be something generated as part of the analysis or other process.
        See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types.

        """
        artifacts = self._get_assets().artifacts
        # NOTE(review): returns a DataPortalFiles collection despite the
        # List[DataPortalFile] annotation — presumably list-compatible; verify
        return DataPortalFiles(
            [
                DataPortalFile(file=artifact.file, client=self._client)
                for artifact in artifacts
            ]
        )

    def download_files(self, download_location: str = None, glob: str = None) -> None:
        """
        Download all the files from the dataset to a local directory.

        Args:
            download_location (str): Path to local directory
            glob (str): Optional wildcard expression to filter which files are downloaded
                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
                If omitted, all files are downloaded.
        """

        files = self.list_files()
        if glob is not None:
            files = DataPortalFiles(filter_files_by_pattern(list(files), glob))
        files.download(download_location)

    def run_analysis(
            self,
            name: str = None,
            description: str = "",
            process: Union[DataPortalProcess, str] = None,
            params: dict = None,
            notifications_emails: List[str] = None,
            compute_environment: str = None,
            resume_dataset_id: str = None,
            source_sample_ids: List[str] = None
    ) -> str:
        """
        Runs an analysis on a dataset, returns the ID of the newly created dataset.

        The process can be provided as either a DataPortalProcess object,
        or a string which corresponds to the name or ID of the process.

        Args:
            name (str): Name of newly created dataset
            description (str): Description of newly created dataset
            process (DataPortalProcess or str): Process to run
            params (dict): Analysis parameters
            notifications_emails (List[str]): Notification email address(es)
            compute_environment (str): Name or ID of compute environment to use,
             if blank it will run in AWS
            resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
             It will attempt to re-use the previous output to minimize duplicate work
            source_sample_ids (List[str]): List of sample IDs to use as input for the analysis.

        Returns:
            dataset_id (str): ID of newly created dataset

        Raises:
            DataPortalInputError: If 'name' or 'process' is missing, or the
                named compute environment cannot be found.
        """
        if name is None:
            raise DataPortalInputError("Must specify 'name' for run_analysis")
        if process is None:
            raise DataPortalInputError("Must specify 'process' for run_analysis")
        if notifications_emails is None:
            notifications_emails = []
        if params is None:
            params = {}

        # If the process is a string, try to parse it as a process name or ID
        process = parse_process_name_or_id(process, self._client)

        if compute_environment:
            # Resolve the user-supplied name/ID to an environment object;
            # compute_environment is rebound from str to the matched object
            compute_environment_name = compute_environment
            compute_environments = self._client.compute_environments.list_environments_for_project(
                project_id=self.project_id
            )
            compute_environment = next(
                (env for env in compute_environments
                 if env.name == compute_environment or env.id == compute_environment),
                None
            )
            if compute_environment is None:
                raise DataPortalInputError(f"Compute environment '{compute_environment_name}' not found")

        resp = self._client.execution.run_analysis(
            project_id=self.project_id,
            request=RunAnalysisRequest(
                name=name,
                description=description,
                process_id=process.id,
                source_dataset_ids=[self.id],
                params=RunAnalysisRequestParams.from_dict(params),
                notification_emails=notifications_emails,
                resume_dataset_id=resume_dataset_id,
                source_sample_ids=source_sample_ids,
                compute_environment_id=compute_environment.id if compute_environment else None
            )
        )
        return resp.id

    def update_samplesheet(self,
                           contents: str = None,
                           file_path: PathLike = None):
        """
        Updates the samplesheet metadata of a dataset.
        Provide either the contents (as a string) or a file path.
        Both must be in the format of a CSV.

        Args:
            contents (str): Samplesheet contents to update (should be a CSV string)
            file_path (PathLike): Path of file to update (should be a CSV file)

        Raises:
            DataPortalInputError: If neither argument is given, or if the
                samplesheet fails validation against the dataset's process.

        Example:
        ```python
        dataset.update_samplesheet(
            file_path=Path('~/samplesheet.csv')
        )
        ```
        """

        if contents is None and file_path is None:
            raise DataPortalInputError("Must specify either 'contents' or 'file_path' when updating samplesheet")

        # file_path, when given, takes precedence over contents
        samplesheet_contents = contents
        if file_path is not None:
            samplesheet_contents = Path(file_path).expanduser().read_text()

        # Validate samplesheet
        file_names = [f.file_name for f in self.list_files()]
        request = ValidateFileRequirementsRequest(
            file_names=file_names,
            sample_sheet=samplesheet_contents,
        )
        requirements = validate_file_requirements.sync(process_id=self.process_id,
                                                       body=request,
                                                       client=self._client.api_client)
        if error_msg := requirements.error_msg:
            raise DataPortalInputError(error_msg)

        # Update the samplesheet if everything looks ok
        self._client.datasets.update_samplesheet(
            project_id=self.project_id,
            dataset_id=self.id,
            samplesheet=samplesheet_contents
        )
562
563
class DataPortalDatasets(DataPortalAssets[DataPortalDataset]):
    """
    Collection of multiple DataPortalDataset objects.

    Inherits the lookup helpers from DataPortalAssets (e.g. get_by_id,
    as used on file collections elsewhere in this module).
    """
    # Singular asset label used by the DataPortalAssets base class
    asset_name = "dataset"
class DataPortalDataset(cirro.sdk.asset.DataPortalAsset):
110class DataPortalDataset(DataPortalAsset):
111    """
112    Datasets in the Data Portal are collections of files which have
113    either been uploaded directly, or which have been output by
114    an analysis pipeline or notebook.
115    """
116
117    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
118        """
119        Instantiate a dataset object
120
121        Should be invoked from a top-level constructor, for example:
122
123        ```python
124        from cirro import DataPortal
125        portal = DataPortal()
126        dataset = portal.get_dataset(
127            project="id-or-name-of-project",
128            dataset="id-or-name-of-dataset"
129        )
130        ```
131
132        """
133        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
134        self._data = dataset
135        self._assets: Optional[DatasetAssets] = None
136        self._client = client
137
138    @property
139    def id(self) -> str:
140        """Unique identifier for the dataset"""
141        return self._data.id
142
143    @property
144    def name(self) -> str:
145        """Editable name for the dataset"""
146        return self._data.name
147
148    @property
149    def description(self) -> str:
150        """Longer name for the dataset"""
151        return self._data.description
152
153    @property
154    def process_id(self) -> str:
155        """Unique ID of process used to create the dataset"""
156        return self._data.process_id
157
158    @property
159    def process(self) -> ProcessDetail:
160        """
161        Object representing the process used to create the dataset
162        """
163        return self._client.processes.get(self.process_id)
164
165    @property
166    def project_id(self) -> str:
167        """ID of the project containing the dataset"""
168        return self._data.project_id
169
170    @property
171    def status(self) -> Status:
172        """
173        Status of the dataset
174        """
175        return self._data.status
176
177    @property
178    def source_dataset_ids(self) -> List[str]:
179        """IDs of the datasets used as sources for this dataset (if any)"""
180        return self._data.source_dataset_ids
181
182    @property
183    def source_datasets(self) -> List['DataPortalDataset']:
184        """
185        Objects representing the datasets used as sources for this dataset (if any)
186        """
187        return [
188            DataPortalDataset(
189                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
190                client=self._client
191            )
192            for dataset_id in self.source_dataset_ids
193        ]
194
195    @property
196    def params(self) -> dict:
197        """
198        Parameters used to generate the dataset
199        """
200        return self._get_detail().params.to_dict()
201
202    @property
203    def info(self) -> dict:
204        """
205        Extra information about the dataset
206        """
207        return self._get_detail().info.to_dict()
208
209    @property
210    def tags(self) -> List[Tag]:
211        """
212        Tags applied to the dataset
213        """
214        return self._data.tags
215
216    @property
217    def share(self) -> Optional[NamedItem]:
218        """
219        Share associated with the dataset, if any.
220        """
221        return self._get_detail().share
222
223    @property
224    def created_by(self) -> str:
225        """User who created the dataset"""
226        return self._data.created_by
227
228    @property
229    def created_at(self) -> datetime.datetime:
230        """Timestamp of dataset creation"""
231        return self._data.created_at
232
233    def _get_detail(self):
234        if not isinstance(self._data, DatasetDetail):
235            self._data = self._client.datasets.get(project_id=self.project_id, dataset_id=self.id)
236        return self._data
237
238    def _get_assets(self):
239        if not self._assets:
240            self._assets = self._client.datasets.get_assets_listing(
241                project_id=self.project_id,
242                dataset_id=self.id
243            )
244        return self._assets
245
246    def __str__(self):
247        return '\n'.join([
248            f"{i.title()}: {self.__getattribute__(i)}"
249            for i in ['name', 'id', 'description', 'status']
250        ])
251
252    def get_file(self, relative_path: str) -> DataPortalFile:
253        """
254        Get a file from the dataset using its relative path.
255
256        Args:
257            relative_path (str): Relative path of file within the dataset
258
259        Returns:
260            `from cirro.sdk.file import DataPortalFile`
261        """
262
263        # Get the list of files in this dataset
264        files = self.list_files()
265
266        # Try getting the file using the relative path provided by the user
267        try:
268            return files.get_by_id(relative_path)
269        except DataPortalAssetNotFound:
270            # Try getting the file with the 'data/' prefix prepended
271            try:
272                return files.get_by_id("data/" + relative_path)
273            except DataPortalAssetNotFound:
274                # If not found, raise the exception using the string provided
275                # by the user, not the data/ prepended version (which may be
276                # confusing to the user)
277                msg = '\n'.join([f"No file found with path '{relative_path}'."])
278                raise DataPortalAssetNotFound(msg)
279
280    def list_files(self, file_limit: int = 100000) -> DataPortalFiles:
281        """
282        Return the list of files which make up the dataset.
283
284        Args:
285            file_limit (int): Maximum number of files to return (default 100,000)
286        """
287        assets = self._client.datasets.get_assets_listing(
288            project_id=self.project_id,
289            dataset_id=self.id,
290            file_limit=file_limit
291        )
292        files = assets.files
293
294        return DataPortalFiles(
295            [
296                DataPortalFile(file=file, client=self._client)
297                for file in files
298            ]
299        )
300
301    def read_files(
302            self,
303            glob: str = None,
304            pattern: str = None,
305            filetype: str = None,
306            **kwargs
307    ):
308        """
309        Read the contents of files in the dataset.
310
311        See :meth:`~cirro.sdk.portal.DataPortal.read_files` for full details
312        on ``glob``/``pattern`` matching and filetype options.
313
314        Args:
315            glob (str): Wildcard expression to match files.
316                Yields one item per matching file: the parsed content.
317            pattern (str): Wildcard expression with ``{name}`` capture
318                placeholders. Yields ``(content, meta)`` per matching file.
319            filetype (str): File format used to parse each file
320                (or ``None`` to infer from extension).
321            **kwargs: Additional keyword arguments forwarded to the
322                file-parsing function.
323
324        Yields:
325            - When using ``glob``: *content* for each matching file
326            - When using ``pattern``: ``(content, meta)`` for each matching file
327        """
328        if glob is not None and pattern is not None:
329            raise DataPortalInputError("Cannot specify both 'glob' and 'pattern' — use one or the other")
330        if glob is None and pattern is None:
331            raise DataPortalInputError("Must specify either 'glob' or 'pattern'")
332
333        if glob is not None:
334            for file in filter_files_by_pattern(list(self.list_files()), glob):
335                yield _read_file_with_format(file, filetype, **kwargs)
336        else:
337            compiled_regex, _ = _pattern_to_captures_regex(pattern)
338            for file in self.list_files():
339                m = compiled_regex.match(file.relative_path)
340                if m is not None:
341                    yield _read_file_with_format(file, filetype, **kwargs), m.groupdict()
342
343    def read_file(
344            self,
345            path: str = None,
346            glob: str = None,
347            filetype: str = None,
348            **kwargs
349    ) -> Any:
350        """
351        Read the contents of a single file from the dataset.
352
353        See :meth:`~cirro.sdk.portal.DataPortal.read_file` for full details.
354
355        Args:
356            path (str): Exact relative path of the file within the dataset.
357            glob (str): Wildcard expression matching exactly one file.
358            filetype (str): File format used to parse the file. Supported values
359                are the same as :meth:`~cirro.sdk.portal.DataPortal.read_files`.
360            **kwargs: Additional keyword arguments forwarded to the file-parsing
361                function.
362
363        Returns:
364            Parsed file content.
365        """
366        if path is not None and glob is not None:
367            raise DataPortalInputError("Cannot specify both 'path' and 'glob' — use one or the other")
368        if path is None and glob is None:
369            raise DataPortalInputError("Must specify either 'path' or 'glob'")
370
371        if path is not None:
372            file = self.get_file(path)
373        else:
374            matches = list(filter_files_by_pattern(list(self.list_files()), glob))
375            if len(matches) == 0:
376                raise DataPortalAssetNotFound(f"No files matched glob '{glob}'")
377            if len(matches) > 1:
378                raise DataPortalInputError(
379                    f"glob '{glob}' matched {len(matches)} files — use read_files() to read multiple files"
380                )
381            file = matches[0]
382
383        return _read_file_with_format(file, filetype, **kwargs)
384
385    def get_trace(self) -> Any:
386        """
387        Read the Nextflow workflow trace file for this dataset as a DataFrame.
388
389        Returns:
390            `pandas.DataFrame`
391        """
392        return self.get_artifact(ArtifactType.WORKFLOW_TRACE).read_csv(sep='\t')
393
394    def get_logs(self) -> str:
395        """
396        Read the Nextflow workflow logs for this dataset as a string.
397
398        Returns:
399            str
400        """
401        return self.get_artifact(ArtifactType.WORKFLOW_LOGS).read()
402
403    def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
404        """
405        Get the artifact of a particular type from the dataset
406        """
407        artifacts = self._get_assets().artifacts
408        artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None)
409        if artifact is None:
410            raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'")
411        return DataPortalFile(file=artifact.file, client=self._client)
412
413    def list_artifacts(self) -> List[DataPortalFile]:
414        """
415        Return the list of artifacts associated with the dataset
416
417        An artifact may be something generated as part of the analysis or other process.
418        See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types.
419
420        """
421        artifacts = self._get_assets().artifacts
422        return DataPortalFiles(
423            [
424                DataPortalFile(file=artifact.file, client=self._client)
425                for artifact in artifacts
426            ]
427        )
428
429    def download_files(self, download_location: str = None, glob: str = None) -> None:
430        """
431        Download all the files from the dataset to a local directory.
432
433        Args:
434            download_location (str): Path to local directory
435            glob (str): Optional wildcard expression to filter which files are downloaded
436                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
437                If omitted, all files are downloaded.
438        """
439
440        files = self.list_files()
441        if glob is not None:
442            files = DataPortalFiles(filter_files_by_pattern(list(files), glob))
443        files.download(download_location)
444
445    def run_analysis(
446            self,
447            name: str = None,
448            description: str = "",
449            process: Union[DataPortalProcess, str] = None,
450            params=None,
451            notifications_emails: List[str] = None,
452            compute_environment: str = None,
453            resume_dataset_id: str = None,
454            source_sample_ids: List[str] = None
455    ) -> str:
456        """
457        Runs an analysis on a dataset, returns the ID of the newly created dataset.
458
459        The process can be provided as either a DataPortalProcess object,
460        or a string which corresponds to the name or ID of the process.
461
462        Args:
463            name (str): Name of newly created dataset
464            description (str): Description of newly created dataset
465            process (DataPortalProcess or str): Process to run
466            params (dict): Analysis parameters
467            notifications_emails (List[str]): Notification email address(es)
468            compute_environment (str): Name or ID of compute environment to use,
469             if blank it will run in AWS
470            resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
471             It will attempt to re-use the previous output to minimize duplicate work
472            source_sample_ids (List[str]): List of sample IDs to use as input for the analysis.
473
474        Returns:
475            dataset_id (str): ID of newly created dataset
476        """
477        if name is None:
478            raise DataPortalInputError("Must specify 'name' for run_analysis")
479        if process is None:
480            raise DataPortalInputError("Must specify 'process' for run_analysis")
481        if notifications_emails is None:
482            notifications_emails = []
483        if params is None:
484            params = {}
485
486        # If the process is a string, try to parse it as a process name or ID
487        process = parse_process_name_or_id(process, self._client)
488
489        if compute_environment:
490            compute_environment_name = compute_environment
491            compute_environments = self._client.compute_environments.list_environments_for_project(
492                project_id=self.project_id
493            )
494            compute_environment = next(
495                (env for env in compute_environments
496                 if env.name == compute_environment or env.id == compute_environment),
497                None
498            )
499            if compute_environment is None:
500                raise DataPortalInputError(f"Compute environment '{compute_environment_name}' not found")
501
502        resp = self._client.execution.run_analysis(
503            project_id=self.project_id,
504            request=RunAnalysisRequest(
505                name=name,
506                description=description,
507                process_id=process.id,
508                source_dataset_ids=[self.id],
509                params=RunAnalysisRequestParams.from_dict(params),
510                notification_emails=notifications_emails,
511                resume_dataset_id=resume_dataset_id,
512                source_sample_ids=source_sample_ids,
513                compute_environment_id=compute_environment.id if compute_environment else None
514            )
515        )
516        return resp.id
517
518    def update_samplesheet(self,
519                           contents: str = None,
520                           file_path: PathLike = None):
521        """
522        Updates the samplesheet metadata of a dataset.
523        Provide either the contents (as a string) or a file path.
524        Both must be in the format of a CSV.
525
526        Args:
527            contents (str): Samplesheet contents to update (should be a CSV string)
528            file_path (PathLike): Path of file to update (should be a CSV file)
529
530        Example:
531        ```python
532        dataset.update_samplesheet(
533            file_path=Path('~/samplesheet.csv')
534        )
535        ```
536        """
537
538        if contents is None and file_path is None:
539            raise DataPortalInputError("Must specify either 'contents' or 'file_path' when updating samplesheet")
540
541        samplesheet_contents = contents
542        if file_path is not None:
543            samplesheet_contents = Path(file_path).expanduser().read_text()
544
545        # Validate samplesheet
546        file_names = [f.file_name for f in self.list_files()]
547        request = ValidateFileRequirementsRequest(
548            file_names=file_names,
549            sample_sheet=samplesheet_contents,
550        )
551        requirements = validate_file_requirements.sync(process_id=self.process_id,
552                                                       body=request,
553                                                       client=self._client.api_client)
554        if error_msg := requirements.error_msg:
555            raise DataPortalInputError(error_msg)
556
557        # Update the samplesheet if everything looks ok
558        self._client.datasets.update_samplesheet(
559            project_id=self.project_id,
560            dataset_id=self.id,
561            samplesheet=samplesheet_contents
562        )

Datasets in the Data Portal are collections of files which have either been uploaded directly, or which have been output by an analysis pipeline or notebook.

DataPortalDataset( dataset: Union[cirro_api_client.v1.models.Dataset, cirro_api_client.v1.models.DatasetDetail], client: cirro.CirroApi)
117    def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
118        """
119        Instantiate a dataset object
120
121        Should be invoked from a top-level constructor, for example:
122
123        ```python
124        from cirro import DataPortal
125        portal = DataPortal()
126        dataset = portal.get_dataset(
127            project="id-or-name-of-project",
128            dataset="id-or-name-of-dataset"
129        )
130        ```
131
132        """
133        assert dataset.project_id is not None, "Must provide dataset with project_id attribute"
134        self._data = dataset
135        self._assets: Optional[DatasetAssets] = None
136        self._client = client

Instantiate a dataset object

Should be invoked from a top-level constructor, for example:

from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
    project="id-or-name-of-project",
    dataset="id-or-name-of-dataset"
)
id: str
138    @property
139    def id(self) -> str:
140        """Unique identifier for the dataset"""
141        return self._data.id

Unique identifier for the dataset

name: str
143    @property
144    def name(self) -> str:
145        """Editable name for the dataset"""
146        return self._data.name

Editable name for the dataset

description: str
148    @property
149    def description(self) -> str:
150        """Longer name for the dataset"""
151        return self._data.description

Longer name for the dataset

process_id: str
153    @property
154    def process_id(self) -> str:
155        """Unique ID of process used to create the dataset"""
156        return self._data.process_id

Unique ID of process used to create the dataset

158    @property
159    def process(self) -> ProcessDetail:
160        """
161        Object representing the process used to create the dataset
162        """
163        return self._client.processes.get(self.process_id)

Object representing the process used to create the dataset

project_id: str
165    @property
166    def project_id(self) -> str:
167        """ID of the project containing the dataset"""
168        return self._data.project_id

ID of the project containing the dataset

status: cirro_api_client.v1.models.Status
170    @property
171    def status(self) -> Status:
172        """
173        Status of the dataset
174        """
175        return self._data.status

Status of the dataset

source_dataset_ids: List[str]
177    @property
178    def source_dataset_ids(self) -> List[str]:
179        """IDs of the datasets used as sources for this dataset (if any)"""
180        return self._data.source_dataset_ids

IDs of the datasets used as sources for this dataset (if any)

source_datasets: List[DataPortalDataset]
182    @property
183    def source_datasets(self) -> List['DataPortalDataset']:
184        """
185        Objects representing the datasets used as sources for this dataset (if any)
186        """
187        return [
188            DataPortalDataset(
189                dataset=self._client.datasets.get(project_id=self.project_id, dataset_id=dataset_id),
190                client=self._client
191            )
192            for dataset_id in self.source_dataset_ids
193        ]

Objects representing the datasets used as sources for this dataset (if any)

params: dict
195    @property
196    def params(self) -> dict:
197        """
198        Parameters used to generate the dataset
199        """
200        return self._get_detail().params.to_dict()

Parameters used to generate the dataset

info: dict
202    @property
203    def info(self) -> dict:
204        """
205        Extra information about the dataset
206        """
207        return self._get_detail().info.to_dict()

Extra information about the dataset

tags: List[cirro_api_client.v1.models.Tag]
209    @property
210    def tags(self) -> List[Tag]:
211        """
212        Tags applied to the dataset
213        """
214        return self._data.tags

Tags applied to the dataset

share: Optional[cirro_api_client.v1.models.NamedItem]
216    @property
217    def share(self) -> Optional[NamedItem]:
218        """
219        Share associated with the dataset, if any.
220        """
221        return self._get_detail().share

Share associated with the dataset, if any.

created_by: str
223    @property
224    def created_by(self) -> str:
225        """User who created the dataset"""
226        return self._data.created_by

User who created the dataset

created_at: datetime.datetime
228    @property
229    def created_at(self) -> datetime.datetime:
230        """Timestamp of dataset creation"""
231        return self._data.created_at

Timestamp of dataset creation

def get_file(self, relative_path: str) -> cirro.sdk.file.DataPortalFile:
252    def get_file(self, relative_path: str) -> DataPortalFile:
253        """
254        Get a file from the dataset using its relative path.
255
256        Args:
257            relative_path (str): Relative path of file within the dataset
258
259        Returns:
260            `DataPortalFile` (from `cirro.sdk.file`)
261        """
262
263        # Get the list of files in this dataset
264        files = self.list_files()
265
266        # Try getting the file using the relative path provided by the user
267        try:
268            return files.get_by_id(relative_path)
269        except DataPortalAssetNotFound:
270            # Try getting the file with the 'data/' prefix prepended
271            try:
272                return files.get_by_id("data/" + relative_path)
273            except DataPortalAssetNotFound:
274                # If not found, raise the exception using the string provided
275                # by the user, not the data/ prepended version (which may be
276                # confusing to the user)
277                msg = '\n'.join([f"No file found with path '{relative_path}'."])
278                raise DataPortalAssetNotFound(msg)

Get a file from the dataset using its relative path.

Arguments:
  • relative_path (str): Relative path of file within the dataset
Returns:

DataPortalFile (from cirro.sdk.file)

def list_files(self, file_limit: int = 100000) -> cirro.sdk.file.DataPortalFiles:
280    def list_files(self, file_limit: int = 100000) -> DataPortalFiles:
281        """
282        Return the list of files which make up the dataset.
283
284        Args:
285            file_limit (int): Maximum number of files to return (default 100,000)
286        """
287        assets = self._client.datasets.get_assets_listing(
288            project_id=self.project_id,
289            dataset_id=self.id,
290            file_limit=file_limit
291        )
292        files = assets.files
293
294        return DataPortalFiles(
295            [
296                DataPortalFile(file=file, client=self._client)
297                for file in files
298            ]
299        )

Return the list of files which make up the dataset.

Arguments:
  • file_limit (int): Maximum number of files to return (default 100,000)
def read_files( self, glob: str = None, pattern: str = None, filetype: str = None, **kwargs):
301    def read_files(
302            self,
303            glob: str = None,
304            pattern: str = None,
305            filetype: str = None,
306            **kwargs
307    ):
308        """
309        Read the contents of files in the dataset.
310
311        See :meth:`~cirro.sdk.portal.DataPortal.read_files` for full details
312        on ``glob``/``pattern`` matching and filetype options.
313
314        Args:
315            glob (str): Wildcard expression to match files.
316                Yields one item per matching file: the parsed content.
317            pattern (str): Wildcard expression with ``{name}`` capture
318                placeholders. Yields ``(content, meta)`` per matching file.
319            filetype (str): File format used to parse each file
320                (or ``None`` to infer from extension).
321            **kwargs: Additional keyword arguments forwarded to the
322                file-parsing function.
323
324        Yields:
325            - When using ``glob``: *content* for each matching file
326            - When using ``pattern``: ``(content, meta)`` for each matching file
327        """
328        if glob is not None and pattern is not None:
329            raise DataPortalInputError("Cannot specify both 'glob' and 'pattern' — use one or the other")
330        if glob is None and pattern is None:
331            raise DataPortalInputError("Must specify either 'glob' or 'pattern'")
332
333        if glob is not None:
334            for file in filter_files_by_pattern(list(self.list_files()), glob):
335                yield _read_file_with_format(file, filetype, **kwargs)
336        else:
337            compiled_regex, _ = _pattern_to_captures_regex(pattern)
338            for file in self.list_files():
339                m = compiled_regex.match(file.relative_path)
340                if m is not None:
341                    yield _read_file_with_format(file, filetype, **kwargs), m.groupdict()

Read the contents of files in the dataset.

See ~cirro.sdk.portal.DataPortal.read_files() for full details on glob/pattern matching and filetype options.

Arguments:
  • glob (str): Wildcard expression to match files. Yields one item per matching file: the parsed content.
  • pattern (str): Wildcard expression with {name} capture placeholders. Yields (content, meta) per matching file.
  • filetype (str): File format used to parse each file (or None to infer from extension).
  • **kwargs: Additional keyword arguments forwarded to the file-parsing function.
Yields:
  • When using glob: content for each matching file
  • When using pattern: (content, meta) for each matching file
def read_file( self, path: str = None, glob: str = None, filetype: str = None, **kwargs) -> Any:
343    def read_file(
344            self,
345            path: str = None,
346            glob: str = None,
347            filetype: str = None,
348            **kwargs
349    ) -> Any:
350        """
351        Read the contents of a single file from the dataset.
352
353        See :meth:`~cirro.sdk.portal.DataPortal.read_file` for full details.
354
355        Args:
356            path (str): Exact relative path of the file within the dataset.
357            glob (str): Wildcard expression matching exactly one file.
358            filetype (str): File format used to parse the file. Supported values
359                are the same as :meth:`~cirro.sdk.portal.DataPortal.read_files`.
360            **kwargs: Additional keyword arguments forwarded to the file-parsing
361                function.
362
363        Returns:
364            Parsed file content.
365        """
366        if path is not None and glob is not None:
367            raise DataPortalInputError("Cannot specify both 'path' and 'glob' — use one or the other")
368        if path is None and glob is None:
369            raise DataPortalInputError("Must specify either 'path' or 'glob'")
370
371        if path is not None:
372            file = self.get_file(path)
373        else:
374            matches = list(filter_files_by_pattern(list(self.list_files()), glob))
375            if len(matches) == 0:
376                raise DataPortalAssetNotFound(f"No files matched glob '{glob}'")
377            if len(matches) > 1:
378                raise DataPortalInputError(
379                    f"glob '{glob}' matched {len(matches)} files — use read_files() to read multiple files"
380                )
381            file = matches[0]
382
383        return _read_file_with_format(file, filetype, **kwargs)

Read the contents of a single file from the dataset.

See ~cirro.sdk.portal.DataPortal.read_file() for full details.

Arguments:
  • path (str): Exact relative path of the file within the dataset.
  • glob (str): Wildcard expression matching exactly one file.
  • filetype (str): File format used to parse the file. Supported values are the same as ~cirro.sdk.portal.DataPortal.read_files().
  • **kwargs: Additional keyword arguments forwarded to the file-parsing function.
Returns:

Parsed file content.

def get_trace(self) -> Any:
385    def get_trace(self) -> Any:
386        """
387        Read the Nextflow workflow trace file for this dataset as a DataFrame.
388
389        Returns:
390            `pandas.DataFrame`
391        """
392        return self.get_artifact(ArtifactType.WORKFLOW_TRACE).read_csv(sep='\t')

Read the Nextflow workflow trace file for this dataset as a DataFrame.

Returns:

pandas.DataFrame

def get_logs(self) -> str:
394    def get_logs(self) -> str:
395        """
396        Read the Nextflow workflow logs for this dataset as a string.
397
398        Returns:
399            str
400        """
401        return self.get_artifact(ArtifactType.WORKFLOW_LOGS).read()

Read the Nextflow workflow logs for this dataset as a string.

Returns:

str

def get_artifact( self, artifact_type: cirro_api_client.v1.models.ArtifactType) -> cirro.sdk.file.DataPortalFile:
403    def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
404        """
405        Get the artifact of a particular type from the dataset
406        """
407        artifacts = self._get_assets().artifacts
408        artifact = next((a for a in artifacts if a.artifact_type == artifact_type), None)
409        if artifact is None:
410            raise DataPortalAssetNotFound(f"No artifact found with type '{artifact_type}'")
411        return DataPortalFile(file=artifact.file, client=self._client)

Get the artifact of a particular type from the dataset

def list_artifacts(self) -> List[cirro.sdk.file.DataPortalFile]:
413    def list_artifacts(self) -> List[DataPortalFile]:
414        """
415        Return the list of artifacts associated with the dataset
416
417        An artifact may be something generated as part of the analysis or other process.
418        See `cirro_api_client.v1.models.ArtifactType` for the list of possible artifact types.
419
420        """
421        artifacts = self._get_assets().artifacts
422        return DataPortalFiles(
423            [
424                DataPortalFile(file=artifact.file, client=self._client)
425                for artifact in artifacts
426            ]
427        )

Return the list of artifacts associated with the dataset

An artifact may be something generated as part of the analysis or other process. See cirro_api_client.v1.models.ArtifactType for the list of possible artifact types.

def download_files(self, download_location: str = None, glob: str = None) -> None:
429    def download_files(self, download_location: str = None, glob: str = None) -> None:
430        """
431        Download all the files from the dataset to a local directory.
432
433        Args:
434            download_location (str): Path to local directory
435            glob (str): Optional wildcard expression to filter which files are downloaded
436                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
437                If omitted, all files are downloaded.
438        """
439
440        files = self.list_files()
441        if glob is not None:
442            files = DataPortalFiles(filter_files_by_pattern(list(files), glob))
443        files.download(download_location)

Download all the files from the dataset to a local directory.

Arguments:
  • download_location (str): Path to local directory
  • glob (str): Optional wildcard expression to filter which files are downloaded (e.g., '*.csv', 'data/**/*.tsv.gz'). If omitted, all files are downloaded.
def run_analysis( self, name: str = None, description: str = '', process: Union[cirro.DataPortalProcess, str] = None, params=None, notifications_emails: List[str] = None, compute_environment: str = None, resume_dataset_id: str = None, source_sample_ids: List[str] = None) -> str:
445    def run_analysis(
446            self,
447            name: str = None,
448            description: str = "",
449            process: Union[DataPortalProcess, str] = None,
450            params=None,
451            notifications_emails: List[str] = None,
452            compute_environment: str = None,
453            resume_dataset_id: str = None,
454            source_sample_ids: List[str] = None
455    ) -> str:
456        """
457        Runs an analysis on a dataset, returns the ID of the newly created dataset.
458
459        The process can be provided as either a DataPortalProcess object,
460        or a string which corresponds to the name or ID of the process.
461
462        Args:
463            name (str): Name of newly created dataset
464            description (str): Description of newly created dataset
465            process (DataPortalProcess or str): Process to run
466            params (dict): Analysis parameters
467            notifications_emails (List[str]): Notification email address(es)
468            compute_environment (str): Name or ID of compute environment to use,
469             if blank it will run in AWS
470            resume_dataset_id (str): ID of dataset to resume from, used for caching task execution.
471             It will attempt to re-use the previous output to minimize duplicate work
472            source_sample_ids (List[str]): List of sample IDs to use as input for the analysis.
473
474        Returns:
475            dataset_id (str): ID of newly created dataset
476        """
477        if name is None:
478            raise DataPortalInputError("Must specify 'name' for run_analysis")
479        if process is None:
480            raise DataPortalInputError("Must specify 'process' for run_analysis")
481        if notifications_emails is None:
482            notifications_emails = []
483        if params is None:
484            params = {}
485
486        # If the process is a string, try to parse it as a process name or ID
487        process = parse_process_name_or_id(process, self._client)
488
489        if compute_environment:
490            compute_environment_name = compute_environment
491            compute_environments = self._client.compute_environments.list_environments_for_project(
492                project_id=self.project_id
493            )
494            compute_environment = next(
495                (env for env in compute_environments
496                 if env.name == compute_environment or env.id == compute_environment),
497                None
498            )
499            if compute_environment is None:
500                raise DataPortalInputError(f"Compute environment '{compute_environment_name}' not found")
501
502        resp = self._client.execution.run_analysis(
503            project_id=self.project_id,
504            request=RunAnalysisRequest(
505                name=name,
506                description=description,
507                process_id=process.id,
508                source_dataset_ids=[self.id],
509                params=RunAnalysisRequestParams.from_dict(params),
510                notification_emails=notifications_emails,
511                resume_dataset_id=resume_dataset_id,
512                source_sample_ids=source_sample_ids,
513                compute_environment_id=compute_environment.id if compute_environment else None
514            )
515        )
516        return resp.id

Runs an analysis on a dataset, returns the ID of the newly created dataset.

The process can be provided as either a DataPortalProcess object, or a string which corresponds to the name or ID of the process.

Arguments:
  • name (str): Name of newly created dataset
  • description (str): Description of newly created dataset
  • process (DataPortalProcess or str): Process to run
  • params (dict): Analysis parameters
  • notifications_emails (List[str]): Notification email address(es)
  • compute_environment (str): Name or ID of compute environment to use, if blank it will run in AWS
  • resume_dataset_id (str): ID of dataset to resume from, used for caching task execution. It will attempt to re-use the previous output to minimize duplicate work
  • source_sample_ids (List[str]): List of sample IDs to use as input for the analysis.
Returns:

dataset_id (str): ID of newly created dataset

def update_samplesheet(self, contents: str = None, file_path: PathLike = None):
518    def update_samplesheet(self,
519                           contents: str = None,
520                           file_path: PathLike = None):
521        """
522        Updates the samplesheet metadata of a dataset.
523        Provide either the contents (as a string) or a file path.
524        Both must be in the format of a CSV.
525
526        Args:
527            contents (str): Samplesheet contents to update (should be a CSV string)
528            file_path (PathLike): Path of file to update (should be a CSV file)
529
530        Example:
531        ```python
532        dataset.update_samplesheet(
533            file_path=Path('~/samplesheet.csv')
534        )
535        ```
536        """
537
538        if contents is None and file_path is None:
539            raise DataPortalInputError("Must specify either 'contents' or 'file_path' when updating samplesheet")
540
541        samplesheet_contents = contents
542        if file_path is not None:
543            samplesheet_contents = Path(file_path).expanduser().read_text()
544
545        # Validate samplesheet
546        file_names = [f.file_name for f in self.list_files()]
547        request = ValidateFileRequirementsRequest(
548            file_names=file_names,
549            sample_sheet=samplesheet_contents,
550        )
551        requirements = validate_file_requirements.sync(process_id=self.process_id,
552                                                       body=request,
553                                                       client=self._client.api_client)
554        if error_msg := requirements.error_msg:
555            raise DataPortalInputError(error_msg)
556
557        # Update the samplesheet if everything looks ok
558        self._client.datasets.update_samplesheet(
559            project_id=self.project_id,
560            dataset_id=self.id,
561            samplesheet=samplesheet_contents
562        )

Updates the samplesheet metadata of a dataset. Provide either the contents (as a string) or a file path. Both must be in the format of a CSV.

Arguments:
  • contents (str): Samplesheet contents to update (should be a CSV string)
  • file_path (PathLike): Path of file to update (should be a CSV file)

Example:

dataset.update_samplesheet(
    file_path=Path('~/samplesheet.csv')
)
565class DataPortalDatasets(DataPortalAssets[DataPortalDataset]):
566    """Collection of multiple DataPortalDataset objects."""
567    asset_name = "dataset"

Collection of multiple DataPortalDataset objects.

asset_name = 'dataset'