cirro.sdk.developer

View Source

  1from io import StringIO
  2
  3from cirro_api_client.v1.api.datasets import get_sample_sheets, ingest_samples
  4from cirro_api_client.v1.api.processes import validate_file_name_patterns
  5from cirro_api_client.v1.models import SampleSheets, ValidateFileNamePatternsRequest, FileNameMatch
  6
  7from cirro.cirro_client import CirroApi
  8from cirro.helpers import PreprocessDataset
  9
 10
 11class Matches(list[FileNameMatch]):
 12    def print(self):
 13        """
 14        Prints the file name validation matches in a readable format.
 15        """
 16        print(f'Matches: {len(self)}')
 17        print()
 18        for match in self:
 19            print(f'{match.file_name}')
 20            print(f'Sample name: {match.sample_name}')
 21            print(f'Matched regex: {match.regex_pattern_match}')
 22            print()
 23
 24
 25class DeveloperHelper:
 26    """
 27    Helper class for developer-related tasks,
 28    such as adding samplesheet preprocessing for a pipeline
 29    or testing file name validation and sample autopopulation.
 30    """
 31
 32    def __init__(self, client: CirroApi):
 33        self.client = client
 34
 35    def generate_preprocess_for_input_datasets(self,
 36                                               project_id: str,
 37                                               input_dataset_ids: list[str],
 38                                               params=None) -> PreprocessDataset:
 39        """
 40        Generates a PreprocessDataset object for the given datasets
 41
 42        With optional parameters to pass into the preprocess script.
 43        `metadata` is not available in this context, so it is mocked.
 44        """
 45        samplesheets = self._generate_samplesheets_for_datasets(project_id, input_dataset_ids)
 46        return PreprocessDataset(
 47            samplesheet=samplesheets.samples,
 48            files=samplesheets.files,
 49            params=params or {},
 50            # Mock metadata
 51            metadata={
 52                'dataset': {},
 53                'project': {},
 54                'inputs': [],
 55                'process': {}
 56            }
 57        )
 58
 59    def test_file_name_validation_for_dataset(self,
 60                                              project_id: str,
 61                                              dataset_id: str,
 62                                              file_name_patterns: list[str]) -> Matches:
 63        """
 64        Tests the file name validation for a given dataset against specified regex patterns.
 65
 66        Used when configuring Cirro's sample autopopulation feature.
 67        More info: https://docs.cirro.bio/features/samples/#using-auto-population
 68        """
 69        dataset_files = self.client.datasets.get_assets_listing(project_id=project_id, dataset_id=dataset_id).files
 70        file_names = [file.relative_path for file in dataset_files]
 71        return self.test_file_name_validation(file_names, file_name_patterns)
 72
 73    def test_file_name_validation(self,
 74                                  file_names: list[str],
 75                                  file_name_patterns: list[str]) -> Matches:
 76        """
 77        Tests the file name validation for a list of file names against specified regex patterns.
 78        """
 79        request_body = ValidateFileNamePatternsRequest(
 80            file_names=file_names,
 81            file_name_patterns=file_name_patterns
 82        )
 83
 84        matches = validate_file_name_patterns.sync(
 85            process_id="test",
 86            body=request_body,
 87            client=self.client.api_client
 88        )
 89        return Matches(matches)
 90
 91    def generate_samplesheets_for_dataset(self, project_id: str, dataset_id: str) -> SampleSheets:
 92        """
 93        Generates Cirro samplesheets for a given dataset
 94        """
 95        return get_sample_sheets.sync(
 96            project_id=project_id,
 97            dataset_id=dataset_id,
 98            client=self.client.api_client
 99        )
100
101    def rerun_sample_ingest_for_dataset(self, project_id: str, dataset_id: str):
102        """
103        Reruns the sample ingest process for a given dataset.
104        You'll want to do this if you have updated the file name patterns in your pipeline (or data type)
105        """
106        ingest_samples.sync_detailed(
107            project_id=project_id,
108            dataset_id=dataset_id,
109            client=self.client.api_client
110        )
111
112    def _generate_samplesheets_for_datasets(self, project_id: str, dataset_ids: list[str]) -> SampleSheets:
113        """
114        Generates Cirro samplesheets for multiple datasets in a project.
115        """
116        # Concatenate samplesheets using pandas
117        import pandas
118        samplesheets_dfs = []
119        files_dfs = []
120        for dataset_id in dataset_ids:
121            samplesheet = self.generate_samplesheets_for_dataset(project_id, dataset_id)
122            samplesheets_dfs.append(pandas.read_csv(StringIO(samplesheet.samples)))
123            files_dfs.append(pandas.read_csv(StringIO(samplesheet.files)))
124
125        samplesheets_df = pandas.concat(samplesheets_dfs, ignore_index=True)
126        files_df = pandas.concat(files_dfs, ignore_index=True)
127        return SampleSheets(
128            samples=samplesheets_df.to_csv(index=False),
129            files=files_df.to_csv(index=False)
130        )

class Matches(list[cirro_api_client.v1.models.file_name_match.FileNameMatch]): View Source

class Matches(list[FileNameMatch]):
    """
    List of file name validation matches that can print itself.
    """

    def print(self):
        """
        Writes a readable report of the file name validation matches to stdout.

        The report starts with the number of matches, followed by one
        blank-line-separated entry per match (file name, sample name, matched regex).
        """
        total = len(self)
        print(f'Matches: {total}')
        print()
        for entry in self:
            print(f'{entry.file_name}')
            print(f'Sample name: {entry.sample_name}')
            print(f'Matched regex: {entry.regex_pattern_match}')
            # Blank line between entries
            print()

Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list. The argument must be an iterable if specified.

def print(self): View Source

13    def print(self):
14        """
15        Prints the file name validation matches in a readable format.
16        """
17        print(f'Matches: {len(self)}')
18        print()
19        for match in self:
20            print(f'{match.file_name}')
21            print(f'Sample name: {match.sample_name}')
22            print(f'Matched regex: {match.regex_pattern_match}')
23            print()

Prints the file name validation matches in a readable format.

class DeveloperHelper: View Source

 26class DeveloperHelper:
 27    """
 28    Helper class for developer-related tasks,
 29    such as adding samplesheet preprocessing for a pipeline
 30    or testing file name validation and sample autopopulation.
 31    """
 32
 33    def __init__(self, client: CirroApi):
 34        self.client = client
 35
 36    def generate_preprocess_for_input_datasets(self,
 37                                               project_id: str,
 38                                               input_dataset_ids: list[str],
 39                                               params=None) -> PreprocessDataset:
 40        """
 41        Generates a PreprocessDataset object for the given datasets
 42
 43        With optional parameters to pass into the preprocess script.
 44        `metadata` is not available in this context, so it is mocked.
 45        """
 46        samplesheets = self._generate_samplesheets_for_datasets(project_id, input_dataset_ids)
 47        return PreprocessDataset(
 48            samplesheet=samplesheets.samples,
 49            files=samplesheets.files,
 50            params=params or {},
 51            # Mock metadata
 52            metadata={
 53                'dataset': {},
 54                'project': {},
 55                'inputs': [],
 56                'process': {}
 57            }
 58        )
 59
 60    def test_file_name_validation_for_dataset(self,
 61                                              project_id: str,
 62                                              dataset_id: str,
 63                                              file_name_patterns: list[str]) -> Matches:
 64        """
 65        Tests the file name validation for a given dataset against specified regex patterns.
 66
 67        Used when configuring Cirro's sample autopopulation feature.
 68        More info: https://docs.cirro.bio/features/samples/#using-auto-population
 69        """
 70        dataset_files = self.client.datasets.get_assets_listing(project_id=project_id, dataset_id=dataset_id).files
 71        file_names = [file.relative_path for file in dataset_files]
 72        return self.test_file_name_validation(file_names, file_name_patterns)
 73
 74    def test_file_name_validation(self,
 75                                  file_names: list[str],
 76                                  file_name_patterns: list[str]) -> Matches:
 77        """
 78        Tests the file name validation for a list of file names against specified regex patterns.
 79        """
 80        request_body = ValidateFileNamePatternsRequest(
 81            file_names=file_names,
 82            file_name_patterns=file_name_patterns
 83        )
 84
 85        matches = validate_file_name_patterns.sync(
 86            process_id="test",
 87            body=request_body,
 88            client=self.client.api_client
 89        )
 90        return Matches(matches)
 91
 92    def generate_samplesheets_for_dataset(self, project_id: str, dataset_id: str) -> SampleSheets:
 93        """
 94        Generates Cirro samplesheets for a given dataset
 95        """
 96        return get_sample_sheets.sync(
 97            project_id=project_id,
 98            dataset_id=dataset_id,
 99            client=self.client.api_client
100        )
101
102    def rerun_sample_ingest_for_dataset(self, project_id: str, dataset_id: str):
103        """
104        Reruns the sample ingest process for a given dataset.
105        You'll want to do this if you have updated the file name patterns in your pipeline (or data type)
106        """
107        ingest_samples.sync_detailed(
108            project_id=project_id,
109            dataset_id=dataset_id,
110            client=self.client.api_client
111        )
112
113    def _generate_samplesheets_for_datasets(self, project_id: str, dataset_ids: list[str]) -> SampleSheets:
114        """
115        Generates Cirro samplesheets for multiple datasets in a project.
116        """
117        # Concatenate samplesheets using pandas
118        import pandas
119        samplesheets_dfs = []
120        files_dfs = []
121        for dataset_id in dataset_ids:
122            samplesheet = self.generate_samplesheets_for_dataset(project_id, dataset_id)
123            samplesheets_dfs.append(pandas.read_csv(StringIO(samplesheet.samples)))
124            files_dfs.append(pandas.read_csv(StringIO(samplesheet.files)))
125
126        samplesheets_df = pandas.concat(samplesheets_dfs, ignore_index=True)
127        files_df = pandas.concat(files_dfs, ignore_index=True)
128        return SampleSheets(
129            samples=samplesheets_df.to_csv(index=False),
130            files=files_df.to_csv(index=False)
131        )

Helper class for developer-related tasks, such as adding samplesheet preprocessing for a pipeline or testing file name validation and sample autopopulation.

DeveloperHelper(client: cirro.CirroApi) View Source

33    def __init__(self, client: CirroApi):
34        self.client = client

client

def generate_preprocess_for_input_datasets( self, project_id: str, input_dataset_ids: list[str], params=None) -> cirro.helpers.preprocess_dataset.PreprocessDataset: View Source

36    def generate_preprocess_for_input_datasets(self,
37                                               project_id: str,
38                                               input_dataset_ids: list[str],
39                                               params=None) -> PreprocessDataset:
40        """
41        Generates a PreprocessDataset object for the given datasets
42
43        With optional parameters to pass into the preprocess script.
44        `metadata` is not available in this context, so it is mocked.
45        """
46        samplesheets = self._generate_samplesheets_for_datasets(project_id, input_dataset_ids)
47        return PreprocessDataset(
48            samplesheet=samplesheets.samples,
49            files=samplesheets.files,
50            params=params or {},
51            # Mock metadata
52            metadata={
53                'dataset': {},
54                'project': {},
55                'inputs': [],
56                'process': {}
57            }
58        )

Generates a PreprocessDataset object for the given datasets

Optional parameters can be passed into the preprocess script. metadata is not available in this context, so it is mocked.

def test_file_name_validation_for_dataset( self, project_id: str, dataset_id: str, file_name_patterns: list[str]) -> Matches: View Source

60    def test_file_name_validation_for_dataset(self,
61                                              project_id: str,
62                                              dataset_id: str,
63                                              file_name_patterns: list[str]) -> Matches:
64        """
65        Tests the file name validation for a given dataset against specified regex patterns.
66
67        Used when configuring Cirro's sample autopopulation feature.
68        More info: https://docs.cirro.bio/features/samples/#using-auto-population
69        """
70        dataset_files = self.client.datasets.get_assets_listing(project_id=project_id, dataset_id=dataset_id).files
71        file_names = [file.relative_path for file in dataset_files]
72        return self.test_file_name_validation(file_names, file_name_patterns)

Tests the file name validation for a given dataset against specified regex patterns.

Used when configuring Cirro's sample autopopulation feature. More info: https://docs.cirro.bio/features/samples/#using-auto-population

def test_file_name_validation( self, file_names: list[str], file_name_patterns: list[str]) -> Matches: View Source

74    def test_file_name_validation(self,
75                                  file_names: list[str],
76                                  file_name_patterns: list[str]) -> Matches:
77        """
78        Tests the file name validation for a list of file names against specified regex patterns.
79        """
80        request_body = ValidateFileNamePatternsRequest(
81            file_names=file_names,
82            file_name_patterns=file_name_patterns
83        )
84
85        matches = validate_file_name_patterns.sync(
86            process_id="test",
87            body=request_body,
88            client=self.client.api_client
89        )
90        return Matches(matches)

Tests the file name validation for a list of file names against specified regex patterns.

def generate_samplesheets_for_dataset( self, project_id: str, dataset_id: str) -> cirro_api_client.v1.models.SampleSheets: View Source

 92    def generate_samplesheets_for_dataset(self, project_id: str, dataset_id: str) -> SampleSheets:
 93        """
 94        Generates Cirro samplesheets for a given dataset
 95        """
 96        return get_sample_sheets.sync(
 97            project_id=project_id,
 98            dataset_id=dataset_id,
 99            client=self.client.api_client
100        )

Generates Cirro samplesheets for a given dataset

def rerun_sample_ingest_for_dataset(self, project_id: str, dataset_id: str): View Source

102    def rerun_sample_ingest_for_dataset(self, project_id: str, dataset_id: str):
103        """
104        Reruns the sample ingest process for a given dataset.
105        You'll want to do this if you have updated the file name patterns in your pipeline (or data type)
106        """
107        ingest_samples.sync_detailed(
108            project_id=project_id,
109            dataset_id=dataset_id,
110            client=self.client.api_client
111        )

Reruns the sample ingest process for a given dataset. You'll want to do this if you have updated the file name patterns in your pipeline (or data type).