cirro.helpers.preprocess_dataset

  1import json
  2import logging
  3import os
  4import warnings
  5from io import StringIO
  6from pathlib import Path
  7from typing import TYPE_CHECKING, Union, Optional
  8
  9import boto3
 10
 11if TYPE_CHECKING:
 12    from pandas import DataFrame
 13
 14from cirro.models.s3_path import S3Path
 15
 16logger = logging.getLogger(__name__)
 17
 18
 19def _fix_s3_path(path: str) -> str:
 20    """
 21    Fix the S3 path to ensure it starts with 's3://'.
 22    """
 23    normalized_path = path.replace(os.sep, '/').strip()
 24    if normalized_path.startswith("s3:/") and not normalized_path.startswith("s3://"):
 25        return normalized_path.replace("s3:/", "s3://", 1)
 26    return path
 27
 28
 29def write_json(dat, local_path: str, indent=4):
 30    """Write a JSON object to a local file."""
 31    with Path(local_path).open(mode="wt") as handle:
 32        return json.dump(dat, handle, indent=indent)
 33
 34
 35def read_csv(path: str, required_columns=None) -> 'DataFrame':
 36    """Read a CSV from the dataset and check for any required columns."""
 37    if required_columns is None:
 38        required_columns = []
 39    path = _fix_s3_path(path)
 40    import pandas as pd
 41    df = pd.read_csv(path)
 42    for col in required_columns:
 43        assert col in df.columns.values, f"Did not find expected column {col} in {path}"
 44    return df
 45
 46
 47def read_json(path: str):
 48    """Read a JSON object from a local file or S3 path."""
 49    path = _fix_s3_path(path)
 50    s3_path = S3Path(path)
 51
 52    if s3_path.valid:
 53        s3 = boto3.client('s3')
 54        retr = s3.get_object(Bucket=s3_path.bucket, Key=s3_path.key)
 55        text = retr['Body'].read().decode()
 56    else:
 57        with Path(path).open() as handle:
 58            text = handle.read()
 59
 60    # Parse JSON
 61    return json.loads(text)
 62
 63
 64class PreprocessDataset:
 65    """
 66    Helper functions for performing preparatory tasks immediately before launching
 67    the analysis workflow for a dataset.
 68
 69    For use in the `preprocess.py` script.
 70    More info: https://docs.cirro.bio/pipelines/preprocess-script/
 71    """
 72    samplesheet: 'DataFrame'
 73    """
 74    A pandas DataFrame containing all of the metadata assigned to the samples present
 75    in the input datasets (at the time of analysis).
 76
 77    More info: https://docs.cirro.bio/pipelines/preprocess-script/#dssamplesheet
 78    """
 79    files: 'DataFrame'
 80    """
 81    A DataFrame containing information on the files contained in the input datasets,
 82    and the sample that each file is assigned to.
 83
 84    More info: https://docs.cirro.bio/pipelines/preprocess-script/#dsfiles
 85    """
 86    params: dict
 87    """
 88    A dictionary with all of the parameter values populated by user input
 89    using the process-form.json and process-input.json configurations.
 90
 91    This is read-only, use `add_param` to add new parameters or `remove_param` to remove them.
 92
 93    More info: https://docs.cirro.bio/pipelines/preprocess-script/#dsparams
 94    """
 95    metadata: dict
 96    """
 97    Detailed information about the dataset at the time of analysis,
 98    including the project, process, and input datasets.
 99
100    More info: https://docs.cirro.bio/pipelines/preprocess-script/#dsmetadata
101    """
102    dataset_root: str
103    """
104    Base path to the dataset
105    """
106
107    _PARAMS_FILE = "params.json"
108    _REFERENCES_BASE = "s3://pubweb-references"
109
110    def __init__(self,
111                 samplesheet: Union['DataFrame', str, Path],
112                 files: Union['DataFrame', str, Path],
113                 params: dict = None,
114                 metadata: dict = None,
115                 dataset_root: str = None):
116        import pandas as pd
117        # Convert string or Path inputs to a DataFrame if necessary
118        if isinstance(samplesheet, str):
119            samplesheet = pd.read_csv(StringIO(samplesheet))
120        if isinstance(samplesheet, Path):
121            samplesheet = read_csv(str(samplesheet))
122        if isinstance(files, str):
123            files = pd.read_csv(StringIO(files))
124        if isinstance(files, Path):
125            files = read_csv(str(files))
126        if params is None:
127            params = {}
128        if metadata is None:
129            metadata = {}
130
131        self.samplesheet = samplesheet
132        self.files = files
133        self.params = params
134        self.metadata = metadata
135        self.dataset_root = dataset_root
136        self.logger = logger
137
138    @classmethod
139    def from_path(cls, dataset_root: str, config_directory='config'):
140        """
141        Creates an instance from a path
142        (useful for testing or when running the script outside Cirro)
143        """
144        config_directory = Path(dataset_root, config_directory)
145
146        files = read_csv(
147            str(Path(config_directory, "files.csv")),
148            required_columns=["sample", "file"]
149        )
150
151        samplesheet = read_csv(
152            str(Path(config_directory, "samplesheet.csv")),
153            required_columns=["sample"]
154        )
155
156        params = read_json(
157            str(Path(config_directory, "params.json")),
158        )
159
160        metadata = read_json(
161            str(Path(config_directory, "metadata.json")),
162        )
163
164        return cls(files=files,
165                   samplesheet=samplesheet,
166                   params=params,
167                   metadata=metadata,
168                   dataset_root=dataset_root)
169
170    @classmethod
171    def from_running(cls):
172        """
173        Creates an instance from the currently running dataset
174        (expected to be called from inside a Cirro analysis process)
175        """
176        logging.basicConfig(level=logging.INFO,
177                            format='%(asctime)s %(levelname)-8s [PreprocessDataset] %(message)s')
178        dataset_path = os.getenv("PW_S3_DATASET")
179        return cls.from_path(dataset_path)
180
181    @property
182    def references_base(self):
183        """
184        Returns the base URL for references.
185        This is used to access public references in the Cirro system.
186        """
187        return self._REFERENCES_BASE
188
189    def log(self):
190        """Print logging messages about the dataset."""
191        logger.info(f"Storage location for dataset: {self.dataset_root}")
192        logger.info(f"Number of files in dataset: {self.files.shape[0]:,}")
193        logger.info(f"Number of samples in dataset: {self.samplesheet.shape[0]:,}")
194
195    def add_param(self, name: str, value, overwrite=False, log=True):
196        """Add a parameter to the dataset."""
197
198        assert overwrite or name not in self.params, \
199            f"Cannot add parameter {name}, already exists (and overwrite=False)"
200
201        if log:
202            logger.info(f"Adding parameter {name} = {value}")
203        self.params[name] = value
204
205        if log:
206            logger.info("Saving parameters")
207        write_json(self.params, self._PARAMS_FILE)
208
209    def remove_param(self, name: str, force=False):
210        """Remove a parameter from the dataset."""
211
212        assert force or name in self.params, \
213            f"Cannot remove parameter {name}, does not exist (and force=False)"
214
215        logger.info(f"Removing parameter {name}")
216        if name in self.params:
217            del self.params[name]
218
219        logger.info("Saving parameters")
220        write_json(self.params, self._PARAMS_FILE)
221
222    def keep_params(self, params_to_keep: list[str]):
223        """Keep only the specified parameters in the dataset."""
224        logger.info(f"Keeping parameters: {params_to_keep}")
225        self.params = {
226            k: v for k, v in self.params.items()
227            if k in params_to_keep
228        }
229        write_json(self.params, self._PARAMS_FILE)
230
231    def update_compute(self, from_str: str, to_str: str, fp="nextflow-override.config"):
232        """Replace all instances of a text string in the compute config file."""
233
234        assert os.path.exists(fp), f"File does not exist: {fp}"
235        with open(fp, 'r') as handle:
236            compute = handle.read()
237        n = len(compute.split(from_str)) - 1
238        logger.info(f"Replacing {n:,} instances of {from_str} with {to_str} in {fp}")
239        compute = compute.replace(from_str, to_str)
240        with open(fp, 'wt') as handle:
241            handle.write(compute)
242
243    def pivot_samplesheet(
244            self,
245            index=None,
246            pivot_columns: Union[Optional[str], list[str]] = 'read',
247            metadata_columns: list[str] = None,
248            column_prefix: str = "fastq_",
249            file_filter_predicate: str = None
250    ):
251        """
252        Combines data from both the samples and files table into a wide format with
253        each sample on a row and each file in a column.
254        The file column indexes are created by default from the `read` column, but can be customized.
255
256        For example, if the `files` table has columns `sample`, `read`, and `file`,
257        and the `samplesheet` has columns `sample`, `status`, and `group`, the output
258        will have columns `sample`, `fastq_1`, `fastq_2`, `status`, and `group`.
259
260        Args:
261            index: List[str], used to make the frame's new index, defaults to
262             `["sampleIndex", "sample", "lane"]`
263            pivot_columns: str or List[str], columns to pivot on and create the new column,
264             defaults to 'read'. This effectively makes the column `<column_prefix><read>`.
265             If the column is not defined or not present, the pivot column will be generated
266             from the file number index.
267            metadata_columns: List[str], metadata columns to include in the output,
268             defaults to all columns that are available from the sample metadata.
269             If your pipeline doesn't like extra columns, make sure to specify the allowed columns here.
270            column_prefix: str, optional, prefix for the new columns, defaults to `fastq_`.
271            file_filter_predicate: str, optional, a pandas query string to filter the files table.
272             A common use case would be to filter out indexed reads, e.g. `readType == "R"`.
273
274        Returns:
275            DataFrame: A wide-format sample sheet with the specified columns pivoted.
276        """
277        import pandas as pd
278
279        pivoted_files = self.pivot_files(index=index,
280                                         pivot_columns=pivot_columns,
281                                         column_prefix=column_prefix,
282                                         file_filter_predicate=file_filter_predicate)
283        combined = pd.merge(pivoted_files, self.samplesheet, on='sample', how="inner", validate="many_to_many")
284
285        # Default to keeping all columns
286        if metadata_columns is None:
287            metadata_columns = self.samplesheet.columns.tolist() + pivoted_files.columns.tolist()
288
289        # Keep only the specified metadata columns
290        all_columns = combined.axes[1]
291        for column in all_columns:
292            if (column not in metadata_columns
293                    # These columns are required, never drop them
294                    and column_prefix not in column
295                    and 'sample' != column):
296                combined = combined.drop(columns=[column])
297
298        return combined
299
300    def pivot_files(
301            self,
302            index: list[str] = None,
303            pivot_columns: Union[str, list[str]] = 'read',
304            column_prefix: str = "fastq_",
305            file_filter_predicate: str = None
306    ):
307        """
308        Format the files table into a wide format with each sample on a row
309        and each file in a column. The column indexes are created by default
310        from the `read` column, but can be customized. This is useful for
311        paired-end sequencing data where you want to have the columns
312        `sample`, `fastq_1`, and `fastq_2` as the output.
313
314        Args:
315            index: List[str], used to make the frame's new index, defaults to `["sampleIndex", "sample", "lane"]`
316            pivot_columns: str or List[str], columns to pivot on and create the new column,
317             defaults to 'read'. This effectively makes the column `<column_prefix><read>`
318            column_prefix: str, optional, prefix for the new columns, defaults to `fastq_`.
319            file_filter_predicate: str, optional, a pandas query string to filter the files table.
320
321        Returns:
322            DataFrame: A wide-format sample sheet with the specified columns pivoted.
323        """
324        if index is None:
325            index = ["sampleIndex", "sample", "lane"]
326        logger.info("Formatting a wide files table")
327        logger.info("File table (long)")
328        logger.info(self.files.head().to_csv(index=False))
329
330        files = self.files
331
332        if file_filter_predicate is not None:
333            # Filter the files table based on the predicate
334            files = files.query(file_filter_predicate)
335
336        # If we don't have access to the column defined, just use the file number
337        # By default this is 'read' but the data might not be paired
338        pivot_columns_defined = pivot_columns is not None and len(pivot_columns) > 0
339        if not pivot_columns_defined or pivot_columns not in files.columns.values:
340            logger.warning("Pivot column not found, grouping by sample instead.")
341            files['file_num'] = files.groupby('sample').cumcount() + 1
342            pivot_columns = 'file_num'
343
344        if isinstance(pivot_columns, str):
345            pivot_columns = [pivot_columns]
346
347        assert pivot_columns in files.columns.values, f"Column '{pivot_columns}' not found in file table"
348        assert 'file' in files.columns.values, "Column 'file' must be present in the file table"
349        assert isinstance(index, list), f"index must be a list (not {type(index)})"
350
351        # Get the list of columns from the inputs
352        input_columns = files.columns.values
353
354        # Format as a wide dataset
355        # Note that all the columns in `index` will be added if they are not already present
356        wide_df = files.reindex(
357            columns=index + pivot_columns + ['file']
358        )
359        wide_df = wide_df.pivot(
360            index=index,
361            columns=pivot_columns,
362            values='file'
363        )
364        # Rename the columns to have a prefix, e.g. 'fastq_'
365        wide_df = wide_df.rename(
366            columns=lambda i: f"{column_prefix}{int(i)}"
367        )
368        wide_df = wide_df.reset_index()
369
370        # Remove any columns from the output which were added from `index`
371        for cname in index:
372            if cname not in input_columns:
373                wide_df = wide_df.drop(columns=[cname])
374        # Remove any extra unnecessary columns
375        wide_df = wide_df.drop(columns=pivot_columns, errors='ignore')
376        return wide_df
377
378    def wide_samplesheet(
379            self,
380            index=None,
381            columns='read',
382            values="file",  # noqa
383            column_prefix="fastq_"
384    ):
385        """
386        Format the samplesheet into a wide format with each sample on a row
387
388        This is a legacy method, please use `pivot_samplesheet` instead.
389        """
390        warnings.warn("`wide_samplesheet` is deprecated, use `pivot_samplesheet` instead.",
391                      DeprecationWarning, stacklevel=2)
392        if values != "file":
393            raise ValueError("The only supported value for `values` is 'file'")
394        return self.pivot_files(index=index, pivot_columns=[columns], column_prefix=column_prefix)
logger = logging.getLogger(__name__)
def write_json(dat, local_path: str, indent=4):

Write a JSON object to a local file.
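
A minimal usage sketch; the dictionary contents and output path below are illustrative:

from cirro.helpers.preprocess_dataset import write_json

# Writes the dict as pretty-printed JSON (indent=4 by default); path is illustrative
write_json({"genome": "GRCh38", "threads": 4}, "params.json")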

def read_csv(path: str, required_columns=None) -> pandas.core.frame.DataFrame:

Read a CSV from the dataset and check for any required columns.
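
A short usage sketch; the path and column list are illustrative, and a missing required column raises an AssertionError:

from cirro.helpers.preprocess_dataset import read_csv

# Path and required columns are illustrative; pandas must be able to read the location
samplesheet = read_csv("config/samplesheet.csv", required_columns=["sample"])
print(samplesheet.head())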

def read_json(path: str):

Read a JSON object from a local file or S3 path.
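
A short usage sketch; both paths are illustrative, and reading from S3 assumes boto3 credentials with access to the bucket:

from cirro.helpers.preprocess_dataset import read_json

params = read_json("config/params.json")                          # local file
metadata = read_json("s3://example-bucket/config/metadata.json")  # S3 object via boto3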

class PreprocessDataset:

Helper functions for performing preparatory tasks immediately before launching the analysis workflow for a dataset.

For use in the preprocess.py script. More info: https://docs.cirro.bio/pipelines/preprocess-script/
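
A minimal preprocess.py sketch, assuming it runs inside a Cirro analysis process; the added parameter name is illustrative, not a required convention:

from cirro.helpers.preprocess_dataset import PreprocessDataset

# Load the dataset currently being analyzed
ds = PreprocessDataset.from_running()

# Inspect the inputs
ds.log()
print(ds.samplesheet.head())
print(ds.files.head())

# Derive an extra workflow parameter (name is illustrative); this rewrites params.json
ds.add_param("sample_count", int(ds.samplesheet.shape[0]))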

PreprocessDataset( samplesheet: Union[pandas.core.frame.DataFrame, str, pathlib.Path], files: Union[pandas.core.frame.DataFrame, str, pathlib.Path], params: dict = None, metadata: dict = None, dataset_root: str = None)
samplesheet: pandas.core.frame.DataFrame

A pandas DataFrame containing all of the metadata assigned to the samples present in the input datasets (at the time of analysis).

More info: https://docs.cirro.bio/pipelines/preprocess-script/#dssamplesheet

files: pandas.core.frame.DataFrame

A DataFrame containing information on the files contained in the input datasets, and the sample that each file is assigned to.

More info: https://docs.cirro.bio/pipelines/preprocess-script/#dsfiles

params: dict

A dictionary with all of the parameter values populated by user input using the process-form.json and process-input.json configurations.

This is read-only, use add_param to add new parameters or remove_param to remove them.

More info: https://docs.cirro.bio/pipelines/preprocess-script/#dsparams

metadata: dict

Detailed information about the dataset at the time of analysis, including the project, process, and input datasets.

More info: https://docs.cirro.bio/pipelines/preprocess-script/#dsmetadata

dataset_root: str

Base path to the dataset

logger
@classmethod
def from_path(cls, dataset_root: str, config_directory='config'):

Creates an instance from a path (useful for testing or when running the script outside Cirro)
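
A hedged local-testing sketch; the directory below is a placeholder and must contain a config/ folder with files.csv, samplesheet.csv, params.json, and metadata.json:

from cirro.helpers.preprocess_dataset import PreprocessDataset

# Placeholder path: point this at a downloaded dataset or a test fixture
ds = PreprocessDataset.from_path("/tmp/example-dataset")
ds.log()
print(ds.params)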

@classmethod
def from_running(cls):

Creates an instance from the currently running dataset (expected to be called from inside a Cirro analysis process)

references_base

Returns the base URL for references. This is used to access public references in the Cirro system.

def log(self):

Print logging messages about the dataset.

def add_param(self, name: str, value, overwrite=False, log=True):

Add a parameter to the dataset.
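
For example, continuing from a PreprocessDataset instance ds (parameter names and values are illustrative); every call rewrites params.json:

ds.add_param("genome", "GRCh38")                  # fails if "genome" is already set
ds.add_param("genome", "GRCm39", overwrite=True)  # replace an existing value
ds.add_param("threads", 4, log=False)             # add without emitting log messages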

def remove_param(self, name: str, force=False):

Remove a parameter from the dataset.

def keep_params(self, params_to_keep: list[str]):

Keep only the specified parameters in the dataset.
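
For example, with a PreprocessDataset instance ds (parameter names are illustrative), drop everything except the parameters the workflow accepts:

ds.keep_params(["genome", "threads"])  # all other entries are removed and params.json is rewritten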

def update_compute(self, from_str: str, to_str: str, fp='nextflow-override.config'):

Replace all instances of a text string in the compute config file.
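
A small sketch, with ds a PreprocessDataset instance and assuming a nextflow-override.config file exists in the working directory; the placeholder token and replacement value are illustrative:

# Swap a placeholder token for a concrete value in the Nextflow override config
ds.update_compute("__QUEUE_NAME__", "spot-queue")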

def pivot_samplesheet( self, index=None, pivot_columns: Union[str, NoneType, list[str]] = 'read', metadata_columns: list[str] = None, column_prefix: str = 'fastq_', file_filter_predicate: str = None):

Combines data from both the samples and files table into a wide format with each sample on a row and each file in a column. The file column indexes are created by default from the read column, but can be customized.

For example, if the files table has columns sample, read, and file, and the samplesheet has columns sample, status, and group, the output will have columns sample, fastq_1, fastq_2, status, and group.

Arguments:
  • index: List[str], used to make the frame's new index, defaults to ["sampleIndex", "sample", "lane"]
  • pivot_columns: str or List[str], columns to pivot on and create the new column, defaults to 'read'. This effectively makes the column `<column_prefix><read>`. If the column is not defined or not present, the pivot column will be generated from the file number index.
  • metadata_columns: List[str], metadata columns to include in the output, defaults to all columns that are available from the sample metadata. If your pipeline doesn't like extra columns, make sure to specify the allowed columns here.
  • column_prefix: str, optional, prefix for the new columns, defaults to fastq_.
  • file_filter_predicate: str, optional, a pandas query string to filter the files table. A common use case would be to filter out indexed reads, e.g. readType == "R".
Returns:

DataFrame: A wide-format sample sheet with the specified columns pivoted.
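
A hedged example matching the scenario in the description (files columns sample/read/file, samplesheet columns sample/status/group), with ds a PreprocessDataset instance; the filter predicate assumes a readType column exists, and the output filename is an illustrative choice:

wide = ds.pivot_samplesheet(
    metadata_columns=["sample", "status", "group"],  # keep only these extra columns
    file_filter_predicate='readType == "R"',         # drop index reads, if readType is present
)
# Expected columns: sample, fastq_1, fastq_2, status, group
wide.to_csv("samplesheet.csv", index=False)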

def pivot_files( self, index: list[str] = None, pivot_columns: Union[str, list[str]] = 'read', column_prefix: str = 'fastq_', file_filter_predicate: str = None):

Format the files table into a wide format with each sample on a row and each file in a column. The column indexes are created by default from the read column, but can be customized. This is useful for paired-end sequencing data where you want to have the columns sample, fastq_1, and fastq_2 as the output.

Arguments:
  • index: List[str], used to make the frame's new index, defaults to ["sampleIndex", "sample", "lane"]
  • pivot_columns: str or List[str], columns to pivot on and create the new column, defaults to 'read'. This effectively makes the column `<column_prefix><read>`
  • column_prefix: str, optional, prefix for the new columns, defaults to fastq_.
  • file_filter_predicate: str, optional, a pandas query string to filter the files table.
Returns:

DataFrame: A wide-format sample sheet with the specified columns pivoted.
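
For example, with ds a PreprocessDataset instance and a paired-end files table whose read column holds 1 and 2, the result has one row per sample with fastq_1 / fastq_2 columns:

paired = ds.pivot_files(pivot_columns="read", column_prefix="fastq_")
print(paired.columns.tolist())  # e.g. ['sample', 'fastq_1', 'fastq_2']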

def wide_samplesheet( self, index=None, columns='read', values='file', column_prefix='fastq_'):

Format the samplesheet into a wide format with each sample on a row

This is a legacy method, please use pivot_samplesheet instead.
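
A migration sketch, with ds a PreprocessDataset instance: wide_samplesheet forwards to pivot_files, so an existing call can be rewritten as shown (the arguments used are the legacy defaults):

# Legacy call (emits a DeprecationWarning)
wide = ds.wide_samplesheet(columns="read", column_prefix="fastq_")

# Equivalent call with the current API
wide = ds.pivot_files(pivot_columns=["read"], column_prefix="fastq_")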