Source code for genedom.batch_domestication

from copy import deepcopy
import itertools

from Bio import SeqIO
import pandas
import proglog

import flametree
from sequenticon import sequenticon

from .PartDomesticator import PartDomesticator
from .reports import write_pdf_domestication_report
from .biotools import (sanitize_and_uniquify, sequence_to_record,
                       annotate_record, write_record)

def _assign_barcodes(barcodes, records, barcode_order):
    """Return a list with one barcode per record, cycling if needed.

    With ``barcode_order == "by_size"`` the first barcode goes to the
    shortest record, the second barcode to the second shortest, and so on.
    Any other value keeps the barcodes in the same order as the records.
    """
    n_records = len(records)
    cycled = list(itertools.islice(itertools.cycle(barcodes), n_records))
    if barcode_order != "by_size":
        return cycled
    # Rank record indices by length (stable sort, so ties keep input order
    # and barcodes themselves are never compared).
    indices_by_size = sorted(range(n_records), key=lambda i: len(records[i]))
    assigned = [None] * n_records
    for rank, record_index in enumerate(indices_by_size):
        assigned[record_index] = cycled[rank]
    return assigned


def batch_domestication(records, target, domesticator=None, standard=None,
                        allow_edits=False, domesticated_suffix="",
                        include_optimization_reports=True,
                        include_original_records=True, barcodes=(),
                        barcode_order='same_as_records', barcode_spacer='AA',
                        logger="bar"):
    """Domesticate a batch of parts according to some domesticator/standard.

    Examples
    --------

    >>> from genedom import BUILTIN_STANDARDS, batch_domestication
    >>> batch_domestication(some_records, "domesticated_parts.zip",
    ...                     standard=BUILTIN_STANDARDS.EMMA)

    Parameters
    ----------

    records
      List of Biopython records to be domesticated.

    target
      Path to a folder, to a zip file, or "@memory" for in-memory report
      generation (the raw binary data of a zip archive is then returned).

    domesticator
      Either a single domesticator, to be used for all parts in the batch,
      or a function f(record) => appropriate_domesticator. Note that a
      "standard" can be provided instead.

    standard
      A StandardDomesticatorsSet object which will be used to attribute a
      specific domesticator to each part. See BUILTIN_STANDARDS for examples.
      When provided, it takes precedence over ``domesticator``.

    allow_edits
      If False, sequences cannot be edited by the domesticator, only extended
      with flanks. If a sequence has for instance forbidden restriction sites,
      the domestication will fail for this sequence (and this will be noted
      in the report).

    domesticated_suffix
      Suffix to give to the domesticated parts names to differentiate them
      from the original parts (this is optional).

    include_optimization_reports
      If yes, some genbanks and pdfs will be produced to show how each part
      was domesticated. This is in particular informative when a domestication
      fails and you want to understand why.

    include_original_records
      Will include the input records into the final report folder/archive,
      for traceability.

    barcodes
      Either a list [(barcode_name, barcode),...] or a dictionary {name: bc}
      or a BarcodesCollection instance. If any of this is provided, the final
      parts will have a barcode added on the left (this barcode will be
      "outside" the part and won't appear in final constructs, but can be
      used to check that the part is the one you think if your samples get
      mixed up). Note that if there are fewer barcodes than parts, the
      barcodes will cycle and several parts may get the same barcode (which
      is generally fine).

    barcode_order
      Either "same_as_records", or "by_size" if you want your barcodes to be
      attributed from the smallest to the longest part in the batch.

    barcode_spacer
      Sequence to appear between the barcode and the left flank of the
      domesticated part.

    logger
      Either "bar" or None for no logger or any Proglog ProgressBarLogger.

    Returns
    -------

    nfails, data
      ``nfails`` is the number of records whose domestication did not
      succeed, ``data`` is whatever closing the target file tree returns
      (the zip archive's binary data when ``target`` is "@memory").

    Raises
    ------

    ValueError
      If neither ``domesticator`` nor ``standard`` is provided.
    """
    if standard is not None:
        domesticator = standard.record_to_domesticator
    if domesticator is None:
        # Fail before the target gets replaced on disk, with a clear message.
        raise ValueError(
            "Provide either a `domesticator` or a `standard` to "
            "batch_domestication."
        )

    # The records are traversed several times below, so materialize them.
    records = list(records)

    logger = proglog.default_bar_logger(logger, min_time_interval=0.2)
    root = flametree.file_tree(target, replace=True)
    domesticated_dir = root._dir("domesticated")
    if include_original_records:
        original_dir = root._dir("original")
    if include_optimization_reports:
        errors_dir = root._dir("error_reports")

    if hasattr(barcodes, 'items'):
        barcodes = list(barcodes.items())
    if len(barcodes):
        barcodes = _assign_barcodes(barcodes, records, barcode_order)

    infos = []
    domesticators = set()
    nfails = 0
    domesticated_records = []

    # DOMESTICATE ALL PARTS, APPEND BARCODE, GATHER DATA

    for i, record in logger.iter_bar(record=list(enumerate(records))):

        # Work on a copy so the caller's records are left untouched.
        record = deepcopy(record)

        original_id = record.id
        domesticated_id = record.id + domesticated_suffix
        domesticated_file_name = domesticated_id + ".gb"

        if isinstance(domesticator, PartDomesticator):
            record_domesticator = domesticator
        else:
            record_domesticator = domesticator(record)
        domesticators.add(record_domesticator)

        if include_optimization_reports:
            report_target = errors_dir._dir(record.id)
        else:
            report_target = None

        if len(barcodes):
            barcode = barcodes[i]
            if not isinstance(barcode, str):
                # Named barcode: (name, sequence).
                barcode_id, barcode = barcode
                barcode_id = " " + barcode_id
            else:
                barcode_id = ""
            barcode = sequence_to_record(barcode)
            annotate_record(barcode, label="BARCODE" + barcode_id)
        else:
            barcode = None

        domestication_results = record_domesticator.domesticate(
            record, report_target=report_target, edit=allow_edits)
        if not domestication_results.success:
            nfails += 1

        if barcode is not None:
            domestication_results.record_after = (
                barcode + barcode_spacer + domestication_results.record_after)
        domesticated_record = domestication_results.record_after
        # original_id is used further down to look up the ordering ID.
        domesticated_record.original_id = original_id
        domesticated_record.id = domesticated_id.replace(' ', '_')
        SeqIO.write(domesticated_record,
                    domesticated_dir._file(domesticated_file_name),
                    "genbank")
        domesticated_records.append(domesticated_record)

        if include_original_records:
            # Archive the *input* record (not the domesticated one).
            write_record(record, original_dir._file(original_id + ".gb"))

        n_edits = domestication_results.number_of_edits()
        added_bp = len(domesticated_record) - len(record)
        before_seqicon = sequenticon(record, output_format="html_image")
        after_seqicon = sequenticon(domesticated_record,
                                    output_format="html_image")
        if domestication_results.success:
            domesticated_cell = after_seqicon + domesticated_id
        else:
            domesticated_cell = "Failed: " + domestication_results.message
        infos.append({
            "id": original_id,
            "Record": before_seqicon + original_id,
            "Domesticator": record_domesticator.name,
            "Domesticated Record": domesticated_cell,
            "Added bp": added_bp,
            "Edited bp": n_edits
        })
        if barcode is not None:
            infos[-1]['Barcode'] = barcode_id

    # WRITE PDF REPORT

    # NOTE(review): the handles opened with .open(...) below are handed to
    # pandas and never explicitly closed here -- confirm that flametree
    # flushes them when the tree is closed.
    sanitizing_table = sanitize_and_uniquify([info['id'] for info in infos])
    order_id_dataframe = pandas.DataFrame(list(sanitizing_table.items()),
                                          columns=["sequence", "order_id"])
    order_id_dataframe.to_csv(root._file("order_ids.csv").open('w'),
                              index=False)
    for info in infos:
        info['Order ID'] = sanitizing_table[info['id']]
    columns = ["Record", "Order ID", "Domesticator", "Domesticated Record",
               "Added bp", "Edited bp"]
    # `infos` is empty when no records were given.
    if infos and "Barcode" in infos[0]:
        columns.append("Barcode")
    infos_dataframe = pandas.DataFrame(infos, columns=columns)
    infos_dataframe.sort_values('Order ID', inplace=True)
    domesticators = sorted(domesticators, key=lambda d: d.name)
    write_pdf_domestication_report(root._file("Report.pdf"), infos_dataframe,
                                   domesticators)

    # WRITE THE SEQUENCES TO ORDER AS FASTA

    order_dir = root._dir('sequences_to_order', replace=True)
    for r in domesticated_records:
        r.id = sanitizing_table[r.original_id]
        r.name = ''
        r.description = ''
    SeqIO.write(domesticated_records,
                order_dir._file("sequences_to_order.fa"), "fasta")

    # WRITE THE SEQUENCES TO ORDER AS EXCEL

    rows = [
        {
            'sequence': str(rec.seq).upper(),
            'length': len(rec),
            'sequence name': rec.id,
        }
        for rec in domesticated_records
    ]
    rows.sort(key=lambda d: d['sequence name'])
    df = pandas.DataFrame.from_records(
        rows, columns=['sequence name', 'length', 'sequence'])
    df.to_excel(order_dir._file("sequences_to_order.xls").open("wb"),
                index=False)
    df.to_csv(order_dir._file("all_domesticated_parts.csv").open("w"),
              index=False)

    return nfails, root._close()