U
    H$xea                  	   @   s   d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	m
Z
 ddlmZmZmZmZmZmZmZmZmZ ddlmZmZmZ ddlZddlZddlmZ ddlmZmZm Z  ddl!m"Z"m#Z$ dd	l%m&Z& dd
l'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z: ddl;m<Z<m=Z=m>Z> ddl?m@Z@mAZA ddlBmCZCmDZD ddlEmFZFmGZGmHZH ddlImJZJmKZKmLZL ddlMmNZNmOZO ddlPmQZQ eR ZSG dd deZTG dd deUZVedddZWeXeeYeYf dddZZeXeeeY df dddZ[eHeeeX  eeeX  eGdd d!Z\eeX d"d#d$Z]eeX eeX eXeXe^d%d&d'Z_eeX eXeeX eeX eeX e^d(d)d*Z`eeX eeX eeXe^f d+d,d-Zae^dd.d/ZbeeX e^e^eeXdf d0d1d2Zce^dd3d4d5ZdG d6d7 d7Zeeee ee f dd8d9ZfeeY eeY e^d:d;d<ZgeeX eeX eYe^d=d>d?Zhe^e^eeX eYe^e^e^d@dAdBZiee( ddCdDZjdEdF ZkdGdH Zlejmjnfe>ddIdJZodKdL ZpdMdN ZqdOdP ZrdQdR ZseYeeY dSdTdUZtdVdW Zue>eeX eXeeX eYe^evedXdYdZZwexd[kreyel  dS )\a  
cutadapt version {version}

Copyright (C) 2010 Marcel Martin <marcel.martin@scilifelab.se> and contributors

Cutadapt removes adapter sequences from high-throughput sequencing reads.

Usage:
    cutadapt -a ADAPTER [options] [-o output.fastq] input.fastq

For paired-end reads:
    cutadapt -a ADAPT1 -A ADAPT2 [options] -o out1.fastq -p out2.fastq in1.fastq in2.fastq

Replace "ADAPTER" with the actual sequence of your 3' adapter. IUPAC wildcard
characters are supported. All reads from input.fastq will be written to
output.fastq with the adapter sequence removed. Adapter matching is
error-tolerant. Multiple adapter sequences can be given (use further -a
options), but only the best-matching adapter will be removed.

Input may also be in FASTA format. Compressed input and output is supported and
auto-detected from the file name (.gz, .xz, .bz2). Use the file name '-' for
standard input/output. Without the -o option, output is sent to standard output.

Citation:

Marcel Martin. Cutadapt removes adapter sequences from high-throughput
sequencing reads. EMBnet.Journal, 17(1):10-12, May 2011.
http://dx.doi.org/10.14806/ej.17.1.200

Run "cutadapt --help" to see all command-line options.
See https://cutadapt.readthedocs.io/ for full documentation.
    N)Path)	TupleOptionalSequenceListAnyIteratorUnionDictIterable)ArgumentParserSUPPRESSHelpFormatter)__version__)warn_duplicate_adaptersAdapterInvalidCharacter)OneLinedumps)!make_adapters_from_specifications)SingleEndModifierLengthTagModifierSuffixRemoverPrefixSuffixAdder
ZeroCapperQualityTrimmerUnconditionalCutterNEndTrimmerAdapterCutterPairedAdapterCutterErrorPairedAdapterCutterNextseqQualityTrimmer	ShortenerReverseComplementerPairedEndRenamerRenamerInvalidTemplatePolyATrimmerPairedReverseComplementer)full_reportminimal_report
Statistics)SingleEndPipelinePairedEndPipeline)Pipelinerun_pipeline)
InputPathsOutputFiles
FileOpener)available_cpu_countProgressDummyProgress)setup_loggingREPORT)HasNoQualitiesc                       s8   e Zd ZdZG dd deZ fddZdd Z  ZS )CutadaptArgumentParserz
    This ArgumentParser customizes two things:
    - The usage message is not prefixed with 'usage:'
    - A brief message is shown on errors, not full usage
    c                       s&   e Zd Z fddZdddZ  ZS )z/CutadaptArgumentParser.CustomUsageHelpFormatterc                    s&   t dt j|d< t j|| d S )Nh   width)minshutilget_terminal_sizecolumnssuper__init__selfargskwargs	__class__ +lib/python3.8/site-packages/cutadapt/cli.pyrA   u   s    z8CutadaptArgumentParser.CustomUsageHelpFormatter.__init__Nc                 C   s&   |t k	r"|||df}| | j| d S )N )r   Z	_add_itemZ_format_usage)rC   usageZactionsgroupsprefixrD   rH   rH   rI   	add_usagey   s    z9CutadaptArgumentParser.CustomUsageHelpFormatter.add_usage)N)__name__
__module____qualname__rA   rN   __classcell__rH   rH   rF   rI   CustomUsageHelpFormattert   s   rS   c                    s0   | j |d< |d dt|d< t j|| d S )NZformatter_classrK   z	{version})rS   replacer   r@   rA   rB   rF   rH   rI   rA   ~   s    
zCutadaptArgumentParser.__init__c                 C   s<   t dtjd t dtjd | dd| j d| d dS )z
        If you override this in a subclass, it should not return -- it
        should either exit or raise an exception.
        z2Run "cutadapt --help" to see command-line options.filez<See https://cutadapt.readthedocs.io/ for full documentation.   
z	: error: N)printsysstderrexitprog)rC   messagerH   rH   rI   error   s    zCutadaptArgumentParser.error)	rO   rP   rQ   __doc__r   rS   rA   r_   rR   rH   rH   rF   rI   r9   m   s   
r9   c                   @   s   e Zd ZdS )CommandLineErrorN)rO   rP   rQ   rH   rH   rH   rI   ra      s   ra   returnc               
   C   s  t tdd} | d}|jddddd |jd	d
dtd |jddddd |jdddtd |jddtddd |jdtdtd |jdtdtd |jdtdtd |jdd d!d"td# | jd$d%d&}|jd'd(d)d* d+g d,d-d.d/ |jd0d1d2d* d+g d,d-d3d/ |jd4d5d6d* d+g d,d-d7d/ |jd8d9d:td;d<d=d> |jd?d"d@d!dAdB |jdCdDtdEddFd> |jdGdHtdIdJdKd> |jdLdddMd |jdNdOd"d!dPdQdR |jdSdTdUdVdW |jdXdYdZddd[d# |jd\d]d^d_td` |jdad]d^dbtd` | dc}|jddded+g tdfdgdh |jditd djdkdl |jdmdnd dodpdq |jdrtdsdtdudl |jdvdddwd |jdxdytd dzd{dl |jd|ddd}d |jd~ddd |jdd+g dd |jddddd |jddddd |jdddd |jdddddd | jddd&}|jddd dddq |jddd dddq |jdtd dEddl |jddtd dddl |jddtd dddl |jdddddd |jdddddd |jddddd | d}|jddddd |jddd ddW |jdddd |jddddd |jddddd |jdd^dddd |jdddd |jddddd |jdddd |jdddd |jdddd |jdd dddq | jddd&}|jddd* dd+g d,ddӍ |jddd* dd+g d,ddӍ |jddd* dd+g d,ddӍ |jddd+g tdzddݍ |jddd dodd |jddddd |jdddd |jdd ddd |jddddd |jdddd |jddd dd |jddd dd | jddtd | S )NF)rK   Zadd_helpZOptionsz-hz--helphelpzShow this help message and exit)actionrd   z	--versionversionzShow version number and exit)re   rd   rf   z--debugcountr   z4Print debug log. Use twice to also print DP matrices)re   defaultrd   z	--profile
store_truez-jz--cores   zFNumber of CPU cores to use. Use 0 to auto-detect. Default: %(default)s)typerh   rd   z--gc-content2   z--buffer-sizei 	= z--compression-level   z
--no-indexindexTZstore_false)destrh   re   rd   zFinding adaptersa  Parameters -a, -g, -b specify adapters to be removed from each read (or from R1 if data is paired-end. If specified multiple times, only the best matching adapter is trimmed (but see the --times option). Use notation 'file:FILE' to read adapter sequences from a FASTA file.)Zdescriptionz-az	--adapterc                 S   s   d| fS NZbackrH   xrH   rH   rI   <lambda>       z%get_argument_parser.<locals>.<lambda>appendZADAPTERadapterszSequence of an adapter ligated to the 3' end (paired data: of the first read). The adapter and subsequent bases are trimmed. If a '$' character is appended ('anchoring'), the adapter is only found if it is a suffix of the read.)rk   re   rh   metavarro   rd   z-gz--frontc                 S   s   d| fS NZfrontrH   rq   rH   rH   rI   rs      rt   a  Sequence of an adapter ligated to the 5' end (paired data: of the first read). The adapter and any preceding bases are trimmed. Partial matches at the 5' end are allowed. If a '^' character is prepended ('anchoring'), the adapter is only found if it is a prefix of the read.z-bz
--anywherec                 S   s   d| fS NZanywhererH   rq   rH   rH   rI   rs      rt   a  Sequence of an adapter that may be ligated to the 5' or 3' end (paired data: of the first read). Both types of matches as described under -a and -g are allowed. If the first base of the read is part of the match, the behavior is as with -g, otherwise as with -a. This option is mostly for rescuing failed library preparations - do not use if you know which end your adapter was ligated to!z-ez--error-ratez--errorsEg?zMaximum allowed error rate (if 0 <= E < 1), or absolute number of errors for full-length adapter match (if E is an integer >= 1). Error rate = no. of errors divided by length of matching region. Default: %(default)s (10%%))rk   rw   rh   rd   z--no-indelsindelszNAllow only mismatches in alignments. Default: allow both mismatches and indels)re   ro   rh   rd   z-nz--timesZCOUNTz@Remove up to COUNT adapters from each read. Default: %(default)sz-Oz	--overlapZ	MINLENGTH   zcRequire MINLENGTH overlap between read and adapter for an adapter to be found. Default: %(default)sz--match-read-wildcardsz8Interpret IUPAC wildcards in reads. Default: %(default)sz-Nz--no-match-adapter-wildcardsmatch_adapter_wildcardsz-Do not interpret IUPAC wildcards in adapters.)re   rh   ro   rd   z--action)trimZretainmaskZ	lowercasenoner~   zWhat to do if a match was found. trim: trim adapter and up- or downstream sequence; retain: trim, but retain adapter; mask: replace with 'N' characters; lowercase: convert to lowercase; none: leave unchanged. Default: %(default)s)choicesrh   rd   z--rcz	--revcompreverse_complementzCheck both the read and its reverse complement for adapter matches. If match is on reverse-complemented version, output that one. Default: check only readz	--no-trimre   Zstore_constr   )ro   re   constrd   z--mask-adapterr   zAdditional read modificationsz-uz--cutZLENzRemove LEN bases from each read (or R1 if paired; use -U option for R2). If LEN is positive, remove bases from the beginning. If LEN is negative, remove bases from the end. Can be used twice if LENs have different signs. Applied *before* adapter trimming.)re   rh   rk   rw   rd   z--nextseq-trimz3'CUTOFFzhNextSeq-specific quality trimming (each read). Trims also dark cycles appearing as high-quality G bases.)rk   rh   rw   rd   z-qz--quality-cutoffz[5'CUTOFF,]3'CUTOFFa!  Trim low-quality bases from 5' and/or 3' ends of each read before adapter removal. Applied to both reads if data is paired. If one value is given, only the 3' end is trimmed. If two comma-separated cutoffs are given, the 5' end is trimmed with the first cutoff, the 3' end with the second.)rh   rw   rd   z--quality-base!   NzAssume that quality values in FASTQ are encoded as ascii(quality + N). This needs to be set to 64 for some old Illumina FASTQ files. Default: %(default)sz--poly-azTrim poly-A tailsz--lengthz-lZLENGTHzShorten reads to LENGTH. Positive values remove bases at the end while negative ones remove bases at the beginning. This and the following modifications are applied after adapter trimming.z--trim-nzTrim N's on ends of reads.z--length-tagZTAGzSearch for TAG followed by a decimal number in the description field of the read. Replace the decimal number with the correct length of the trimmed read. For example, use --length-tag 'length=' to correct fields like 'length=123'.)rw   rd   z--strip-suffixzKRemove this suffix from read names if present. Can be given multiple times.z-xz--prefixrJ   zUAdd this prefix to read names. Use {name} to insert the name of the matching adapter.)rh   rd   z-yz--suffixz6Add this suffix to read names; can also include {name}z--renameTEMPLATEzfRename reads using TEMPLATE containing variables such as {id}, {adapter_name} etc. (see documentation)z
--zero-capz-zz'Change negative quality values to zero.zFiltering of processed readsz|Filters are applied after above read modifications. Paired-end reads are always discarded pairwise (see also --pair-filter).z-mz--minimum-lengthz
LEN[:LEN2]z*Discard reads shorter than LEN. Default: 0z-Mz--maximum-lengthz0Discard reads longer than LEN. Default: no limitz--max-nzDiscard reads with more than COUNT 'N' bases. If COUNT is a number between 0 and 1, it is interpreted as a fraction of the read length.z--max-expected-errorsz--max-eeZERRORSz\Discard reads whose expected number of errors (computed from quality values) exceeds ERRORS.z--max-average-error-ratez	--max-aerZ
ERROR_RATEzcas --max-expected-errors (see above), but divided by length to account for reads of varying length.z--discard-trimmedz	--discardzhDiscard reads that contain an adapter. Use also -O to avoid discarding too many randomly matching reads.z--discard-untrimmedz--trimmed-onlyz-Discard reads that do not contain an adapter.z--discard-casavazBDiscard reads that did not pass CASAVA filtering (header has :Y:).ZOutputz--quietzPrint only error messages.)rh   re   rd   z--report)ZfullminimalzAWhich type of report to print: 'full' or 'minimal'. Default: fullz--jsonZFILEz"Dump report in JSON format to FILEz-oz--outputzWrite trimmed reads to FILE. FASTQ or FASTA format is chosen depending on input. Summary report is sent to standard output. Use '{name}' for demultiplexing (see docs). Default: write to standard outputz--fastaz4Output FASTA to standard output even on FASTQ input.z-Zcompression_levelzNUse compression level 1 for gzipped output files (faster, but uses more space))re   r   ro   rd   z--info-filezoWrite information about each read and its adapter matches into FILE. See the documentation for the file format.z-rz--rest-filez]When the adapter matches in the middle of a read, write the rest (after the adapter) to FILE.z--wildcard-filezyWhen the adapter has N wildcard bases, write adapter bases matching wildcard positions to FILE. (Inaccurate with indels.)z--too-short-outputzdWrite reads that are too short (according to length specified by -m) to FILE. Default: discard readsz--too-long-outputzcWrite reads that are too long (according to length specified by -M) to FILE. Default: discard readsz--untrimmed-outputzbWrite reads that do not contain any adapter to FILE. Default: output to same file as trimmed readszPaired-end optionsznThe -A/-G/-B/-U/-Q options work like their lowercase counterparts, but are applied to R2 (second read in pair)z-Ac                 S   s   d| fS rp   rH   rq   rH   rH   rI   rs   ^  rt   	adapters2z 3' adapter to be removed from R2)rk   ro   re   rh   rw   rd   z-Gc                 S   s   d| fS rx   rH   rq   rH   rH   rI   rs   a  rt   z 5' adapter to be removed from R2z-Bc                 S   s   d| fS ry   rH   rq   rH   rH   rI   rs   d  rt   z"5'/3 adapter to be removed from R2z-Ucut2zRemove LENGTH bases from R2)ro   re   rh   rk   rw   rd   z-Qquality_cutoff2z7Quality-trimming cutoff for R2. Default: same as for R1)ro   rh   rw   rd   z-pz--paired-outputzWrite R2 to FILE.z--pair-adapterszcTreat adapters given with -a/-A etc. as pairs. Either both or none are removed from each read pair.z--pair-filter)anyZbothfirstzWhich of the reads in a paired-end read have to match the filtering criterion in order for the pair to be filtered. Default: any)rh   r   rd   z--interleavedz/Read and/or write interleaved paired-end reads.z--untrimmed-paired-outputzWrite second read in a pair to this FILE when no adapter was found. Use with --untrimmed-output. Default: output to same file as trimmed readsz--too-short-paired-outputz>Write second read in a pair to this file if pair is too short.)rw   rh   rd   z--too-long-paired-outputz=Write second read in a pair to this file if pair is too long.inputs*)nargsrd   )r9   r`   Zadd_argument_groupadd_argumentr   r   intfloat)parsergrouprH   rH   rI   get_argument_parser   s   





      	
  

 













      



r   )src   c              
   C   s   zdd |  dD }W n0 tk
rH } ztd| W 5 d}~X Y nX t|dkrdd|d g}nt|dkrxtd	|d |d fS )
zParse a string INT[,INT] into a pair of integers

    >>> parse_cutoffs("5")
    (0, 5)
    >>> parse_cutoffs("6,7")
    (6, 7)
    c                 S   s   g | ]}t |qS rH   r   ).0valuerH   rH   rI   
<listcomp>  s     z!parse_cutoffs.<locals>.<listcomp>,z%Quality cutoff value not recognized: Nrj   r   rW   zJExpected one value or two values separated by comma for the quality cutoff)split
ValueErrorra   len)r   ZcutoffserH   rH   rI   parse_cutoffs  s     r   .c              
   C   s   |  d}t|dkrtdztdd |D }W n0 tk
rd } ztd| W 5 d}~X Y nX t|dkr|d	 dkr|d
 dkrtd|  dt|S )zParse [INT][:[INT]] into a pair of integers. If a value is omitted, use None

    >>> parse_lengths('25')
    (25,)
    >>> parse_lengths('17:25')
    (17, 25)
    >>> parse_lengths('25:')
    (25, None)
    >>> parse_lengths(':25')
    (None, 25)
    :)rj   rW   z!Only at most one colon is allowedc                 s   s"   | ]}|d krt |ndV  qdS )rJ   Nr   )r   frH   rH   rI   	<genexpr>  s     z parse_lengths.<locals>.<genexpr>zValue not recognized: NrW   r   rj   zCannot parse 'z(': At least one length needs to be given)r   r   ra   tupler   )r   Zfieldsvaluesr   rH   rH   rI   parse_lengths  s    
 $
r   )file_openeradapter_namesadapter_names2rc   c                 C   s\  t | j| j| j| j| j| j| j| j| j	| j
| jg || jd}|| jd}|| jd}d }}	| jdk	r|| j| jd\}}	d }
}| jdk	r|| j| jd\}
}t| jt| j t| jdk	 dkrtdt| j
| j}|r| jrtd|dkr| jrtd|dkrXd }}d }}t|| j
| j| j| j	| j|\}}}}n|dkrd	| j
krzd
| j
ks~td	| jkrd
| jkst| js| j	rtdd }}d }}d }}t||| j
| j| j|\}}nJd }}d }}|| j| j	d\}}|| j
| jd\}}|dkr2|}t|||||	|
|||||||||| jdS )z
    Return an OutputFiles instance. If demultiplex is True, the untrimmed, untrimmed2, out and out2
    attributes are not opened files, but paths (out and out2 with the '{name}' template).
    wbNrj   zsOnly one of the --discard-trimmed, --discard-untrimmed and --untrimmed-output options can be used at the same time.z1Do not use --discard-trimmed when demultiplexing.combinatorialzWith --pair-adapters, you can only use {name} in your output file name template, not {name1} and {name2} (no combinatorial demultiplexing).normal{name1}{name2}zCombinatorial demultiplexing (with {name1} and {name2}) cannot be combined with --untrimmed-output or --untrimmed-paired-output)restinfowildcard	too_short
too_short2too_long	too_long2	untrimmed
untrimmed2outout2demultiplex_outdemultiplex_out2combinatorial_outcombinatorial_out2Zforce_fasta)complain_about_duplicate_paths	rest_file	info_filewildcard_filetoo_short_outputtoo_short_paired_outputtoo_long_outputtoo_long_paired_outputuntrimmed_outputuntrimmed_paired_outputoutputpaired_outputZxopen_or_noneminimum_lengthZ
xopen_pairmaximum_lengthr   discard_trimmeddiscard_untrimmedra   determine_demultiplex_modepair_adaptersopen_demultiplex_outAssertionErroropen_combinatorial_outr1   Zfasta)rD   default_outfiler   r   r   r   r   r   r   r   r   r   Zdemultiplex_moder   r   r   r   r   r   r   r   rH   rH   rI   open_output_files  s    
  
  



	  
r   )pathsc                 C   sr   t jdkrt jdk rd S t }| D ]J}|d kr0q"t|}| rJ| sJq"||krbtd| d|| q"d S )NZwin32)r|      zPath zQ specified more than once as an output file. This is not supported at the moment.)	rZ   platformversion_infosetr   existsis_filera   add)r   seenpathprH   rH   rI   r   ;  s    
r   )r   r   output_templatepaired_output_templater   c                 C   s   t  }t  }|rg }n*dg}|dd |D 7 }|dd | D 7 }tt| || D ]x\}	}
|	d k	rh|	nd}|
d k	rx|
nd}|d|d|}|d|d|}||d||	|
f< ||d||	|
f< qT||fS )	NNNc                 S   s   g | ]}d |fqS NrH   )r   name2rH   rH   rI   r   ^  s     z*open_combinatorial_out.<locals>.<listcomp>c                 S   s   g | ]}|d fqS r   rH   )r   name1rH   rH   rI   r   _  s     unknownr   r   r   )dictlist	itertoolsproductrT   xopen)r   r   r   r   r   r   r   r   Zextrar   r   Zfname1Zfname2path1path2rH   rH   rI   r   O  s$     r   )r   	template1	template2r   r   r   c                 C   s   t  }|d k	rt  nd }| D ]P}	|d|	}
||
d||	< |d k	r|d k	sPt|d|	}||d||	< q|dd}|r|}|rd }n||d}|d k	r|dd}|r|}|rd }q||d}nd }||||fS )N{name}r   r   )r   rT   r   r   )r   r   r   r   r   r   r   r   r   namer   r   Zuntrimmed_pathr   Zuntrimmed2_pathr   rH   rH   rI   r   l  s2    	r   )r   r   rc   c                 C   s   | dk	od| k}|dk	r,|d|kkr,t d| dk	oZ|dk	oZd| koZd| koZd|koZd|k}|rl|rlt d|rtdS |r|dS d	S dS )
z0Return one of "normal", "combinatorial" or FalseNr   z_When demultiplexing paired-end data, "{name}" must appear in both output file names (-o and -p)r   r   z2You cannot combine {name} with {name1} and {name2}r   r   F)ra   )r   r   ZdemultiplexZdemultiplex_combinatorialrH   rH   rI   r     s.    r   c                 C   s:   t | jp6| jp6| jp6| jp6| jp6| jp6| jp6| jp6| j	S )z>
    Determine whether we should work in paired-end mode.
    )
boolr   interleavedr   r   pair_filterr   r   r   r   rD   rH   rH   rI   determine_paired  s$    r   )r   pairedr   rc   c                 C   s   t | dkrtdn:t | dkrPtdt | d ddd | D  d	 | d }|r|st | d
krvtdq| d
 }nt | dkrtdd }|r||fS |fS d S )Nr   zIYou did not provide any input file names. Please give me something to do!rW   zFYou provided {} input file names, but either one or two are expected. zThe file names were:
 - z
 - c                 s   s   | ]}d | d V  qdS )'NrH   )r   r   rH   rH   rI   r     s     z$setup_input_files.<locals>.<genexpr>zE
Hint: If your path contains spaces, you need to enclose it in quotesrj   zYou used an option that enabled paired-end mode (such as -p, -A, -G, -B, -U), but then you also need to provide two input files (you provided one) or use --interleaved.zIt appears you want to trim paired-end data because you provided two input files, but then you also need to provide two output files (with -o and -p) or use the --interleaved option.)r   ra   formatjoin)r   r   r   Zinput_filenameZinput_paired_filenamerH   rH   rI   setup_input_files  s>    
r   )r   rc   c                 C   s   |s | j rtd| jr td|r| js| js8td| jsFtd| j| j df| j| jdf| j	| j
dffD ]*\}}}t|t|krntdj|d	qn| jd
k rtdd| j  krdksn td| jr| jd
krtdd S )NzQOption --untrimmed-paired-output can only be used when trimming paired-end reads.zFOption --pair-adapters can only be used when trimming paired-end readszWhen a paired-end trimming option such as -A/-G/-B/-U, is used, a second output file needs to be specified via -p (--paired-output).zDWhen you use -p or --paired-output, you must also use the -o option.r   z	too-shortztoo-longzvWhen trimming paired-end data, you must use either none or both of the --{name}-output/--{name}-paired-output options.r   rj   zThe overlap must be at least 1.r   d   z8GC content must be given as percentage between 0 and 100z+--pair-adapters cannot be used with --times)r   ra   r   r   r   r   r   r   r   r   r   r   r   overlap
gc_contenttimes)rD   r   r   Z
paired_outargnamerH   rH   rI   check_arguments  sH    

r  c                   @   s"   e Zd Zdd ZedddZdS )PipelineMakerc                 C   s2   || _ |jdkrd n|j| _|| _|| _|| _d S )Nr   )rD   re   r   rv   r   )rC   rD   r   rv   r   rH   rH   rI   rA   "  s
    zPipelineMaker.__init__rb   c                 C   s  g }| t| jj| jj| j | jjdk	rdt| jj| jj}| jrZ|	|t

|f n
|	| | t| jj| jj| jj| j | t| j| j| j| jj| j| jj| jj| jj | jj	 | jjr| jr|	t tddf n|	t  t| jD ],}| jr |	|t

|f q|	| q| jjrR| jjsJ| jjrRtd| jjr| jjdkrz.| jrt| jjn
t| jj}|	| W n, tk
r } zt|W 5 d}~X Y nX | jr| jj dkrdn| jj }t!||}nt"|}t#|t!r<| jr| js<| jj$s6| jj%s6| jj&r<d|_'dD ]t}t(| j|}	|	dk	r@t)|	}
| jst*|
dkrtd	| jrt*|
d
kr|
d |
d f}
t+|||
 q@| jj,|_,| jj-|_-| jj.|_.| jj/|_/| jj0|_0| jj$|_$|S )z
        Set up a processing pipeline from parsed command-line arguments.

        If there are any problems parsing the arguments, a CommandLineError is raised.

        Return an instance of Pipeline (SingleEndPipeline or PairedEndPipeline)
        NT)ZrevcompzFOption --rename cannot be combined with --prefix (-x) or --suffix (-y)z{header}r   )r   r   rW   z8Two minimum or maximum lengths given for single-end datarj   r   )1extendmake_unconditional_cuttersrD   Zcutr   r   Znextseq_trimr!   quality_baseru   copymake_quality_trimmersZquality_cutoffr   make_adapter_cutterrv   r   r   re   r  r   renamern   Zpoly_ar'   )modifiers_applying_to_both_ends_if_pairedrM   suffixra   r$   r%   r&   r   r-   r,   
isinstancer   r   r   Zoverride_untrimmed_pair_filtergetattrr   r   setattrZmax_nZmax_expected_errorsZmax_average_error_rateZdiscard_casavar   )rC   Z	modifiersZtrimmerZmodifierZrenamerr   Zpair_filter_modepipelineattrZparamZlengthsrH   rH   rI   make)  s     

	






zPipelineMaker.makeN)rO   rP   rQ   rA   r.   r  rH   rH   rH   rI   r  !  s   r  c              
   C   s   t | j| j| j| j| jd}zt| j|}t| j|}W n6 t	t
tfk
rn } zt|jd W 5 d }~X Y nX t| t| | jdkr|| D ]}|  q||fS )N)Z
max_errorsZmin_overlapZread_wildcardsZadapter_wildcardsr{   r   rj   )r   Z
error_rater   Zmatch_read_wildcardsr}   r{   r   rv   r   KeyErrorr   r   ra   rD   r   debugZenable_debug)rD   Zsearch_parametersrv   r   r   ZadapterrH   rH   rI   adapters_from_args  s,     

r  )cut1r   r   c                 c   s   t | |gD ]\}}|sqt|dkr.tdt|dkrV|d |d  dkrVtd|D ]L}|dkrhqZ|dkr|rt|d fV  qt|V  qZ|std t|fV  qZqd S )NrW   z0You cannot remove bases from more than two ends.r   rj   z0You cannot remove bases from the same end twice.)	enumerater   ra   r   r   )r  r   r   iZcut_argcrH   rH   rI   r    s      r  )cutoff1cutoff2r  r   c                 #   s    fdd| |fD }|r`| d k	r<|d kr<t  |d |d< |d d k	sT|d d k	r~t|V  n|d d k	r~|rtt|d V  d S )Nc                    s2   g | ]*}|d k	r*|dkr*t t| f nd qS )N0)r   r   )r   cutoffr  rH   rI   r     s   z)make_quality_trimmers.<locals>.<listcomp>r   rj   )r  r   r   )r  r  r  r   Z	qtrimmersrH   r  rI   r	    s    
r	  )r   r   re   r  r   add_rc_suffixallow_indexc	              
   c   s  |r\|rt dzt| ||}	W n2 tk
rR }
 zt dt|
 W 5 d }
~
X Y nX |	V  nd\}}z(| rxt| |||}|rt||||}W n* tk
r }
 zt |
W 5 d }
~
X Y nX |r|s|r|rt|||rdnd dV  n
||fV  n,|r|rt||rdnd dV  n|V  d S )Nz)Cannot use --revcomp with --pair-adaptersz--pair-adapters: r   z rc)Z	rc_suffix)ra   r    r   strr   r   r(   r#   )rv   r   r   r   re   r  r   r   r!  Zcutterr   Zadapter_cutterZadapter_cutter2rH   rH   rI   r
    s@    "


r
  c                 c   s   | j d k	rt| j V  | jr$t V  | jr6t| jV  | jD ]}t|V  q<| jsX| j	rht
| j| j	V  | jr|t| jdV  d S )Nr  )Zlengthr"   Ztrim_nr   Z
length_tagr   Zstrip_suffixr   rM   r  r   Zzero_capr   r  )rD   r  rH   rH   rI   r    s    

r  c                 C   sJ   t  }|dkrd| d nd}tdtt  | tdd|  dS )	z'Print the "This is cutadapt ..." headerZCPythonz ()rJ   z$This is cutadapt %s with Python %s%szCommand line parameters: %s N)r   Zpython_implementationloggerr   r   python_versionr   )cmdlineargsimplementationZoptrH   rH   rI   
log_header"  s    r)  c                   C   s   t   ttjdd  dS )z#Entry point for command-line scriptrj   Nr   )multiprocessingZfreeze_supportmainrZ   argvrH   rH   rH   rI   main_cli0  s    r-  c                 C   s  t   }t }|j| d\}}tjjsFttt||j	|j
dk|jd t|  t|j}t  |j	rt|j
rt|d |rt|  |dd|  |jdk r|d |jdkrt n|j}t|jt|d	}tj r|j	s|jst }	nt }	t|}
z|jot|j d
k}t!|j |
|}t"||
 t#|\}}t$||
rF|nd t%||
||& }dd |D }dd |D }t'|||||}t(|d|i}t)dddd|j* ||d
krdnd t+|||||	|j,}W n t-k
r   |jr nt.dtjd t/d Y n t0k
r0   t/d
 Y nl t1t2t3t4j5t4j6t7fk
r } z<tjddd td| t8|t7r|dnd
}t/| W 5 d}~X Y nX t   | }|j
dkrt9}nt:}t;t<d||||j=d  |j>dk	rRt?|j>dZ}t@|| |jAd t|jAd
kr|jAd
 nd||
|j=d d }|BtC| |Bd! W 5 Q R X |dk	rddlD}|E  |F|Gd"Hd# |S )$z
    Set up a processing pipeline from the command-line arguments, run it and return
    a Statistics object.

    default_outfile is the file to which trimmed reads are sent if the ``-o``
    parameter is not used.
    r   r   )Zlog_to_stderrquietr   r  z<Options --quiet and --report cannot be used at the same timezunrecognized arguments: r$  r   z$Value for --cores cannot be negative)r   Zthreadsrj   Nc                 S   s   g | ]
}|j qS rH   r   r   arH   rH   rI   r   m  s     zmain.<locals>.<listcomp>c                 S   s   g | ]
}|j qS rH   r   r/  rH   rH   rI   r   n  s     r   z$Processing %s reads on %d core%s ...z
single-endz
paired-end)FTr   rJ   ZInterruptedrU      zCommand line error. Traceback:T)exc_infoz%srW   g      Y@w)statsr'  r   r   coresr   r   rX   time   )Ir6  r   Zparse_known_argsloggingrootZhandlersr6   r%  is_any_output_stdoutr.  reportr  r)  setup_profiler_if_requestedZprofilelog_system_infor_   warn_if_en_dashesr   r5  r3   r2   r   estimate_compression_threadsrZ   r[   isattyr4   r5   r   r   r   r   r   r  r  log_adaptersr  r  r   r0   r   r   r/   Zbuffer_sizeKeyboardInterruptrY   r\   BrokenPipeErrorOSErrorEOFErrorr8   dnaioZUnknownFileFormatZFileFormatErrorra   r  r*   r)   logr7   r   Zjsonopenjson_reportr   write
json_dumpspstatsdisableZStatsZ
sort_statsZprint_stats)r'  r   Z
start_timer   rD   Zleftover_argsprofilerr5  r   Zprogressr   Zis_interleaved_inputZinput_pathsrv   r   r  r   r   ZoutfilesZinpathsr4  r   Z	exit_codeelapsedr;  r   Z	json_dictrL  rH   rH   rI   r+  7  s    




         	
r+  c                   C   s.   t dtj t dtj t dtj d S )NzPython executable: %szdnaio version: %szxopen version: %s)r%  r  rZ   
executablerF  r   r   rH   rH   rH   rI   r=    s    r=  c                 C   s   |d k	}t |rdndt|  t| dD ]}t d| q,t| dkr^t dt| d  |rt dt| t|dD ]}t d| q~t|dkrt dt|d  d S )NzR1 adapters (%d):zAdapters (%d):r7  z- %sz- (%d more)zR2 adapters (%d):)r%  r  r   r   islice)rv   r   r   r0  rH   rH   rI   rA    s    rA  c                 C   s&   | rdd l }| }|  nd }|S )Nr   )cProfileZProfileenable)Z	requestedrR  rN  rH   rH   rI   r<    s    
r<  c                 C   s$   | D ]}| drtd| qd S )Nu   –u   The first character in argument '%s' is '–' (an en-dash, Unicode U+2013) and will therefore be interpreted as a file name. If you wanted to provide an option, use a regular hyphen '-'.)
startswithr%  Zwarning)rD   argrH   rH   rI   r>    s    
r>  )r5  rc   c                 C   s   t dt| d dS )Nr   rj      )maxr<   )r5  rH   rH   rI   r?    s    r?  c                 C   sh   t | jd k| jdk| jdk| jdk| jdk| jdk| jdk| jdk| jdk| j	dk| j
dk| jdkgS )N-)r   r   r   r   r   r   r   r   r   r   r   r   r   rH   rH   rI   r:    s    r:  )r4  r'  r   r   r5  r   r   rc   c              
   C   s@   dt ddgtt |||||dd}|| j|dd |S )NzCutadapt reportr   r|   )r   r   r   )tagZschema_versionZcutadapt_versionr&  Zcommand_line_argumentsr5  inputT)Zone_line)r   r   r   r&  updateZas_json)r4  r'  r   r   r5  r   r   drH   rH   rI   rI    s    

rI  __main__)zr`   r  rZ   r6  r=   r8  r   r   r*  pathlibr   typingr   r   r   r   r   r   r	   r
   r   argparser   r   r   rF  r   Zcutadaptr   Zcutadapt.adaptersr   r   r   Zcutadapt.jsonr   r   rK  Zcutadapt.parserr   Zcutadapt.modifiersr   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   Zcutadapt.reportr)   r*   r+   Zcutadapt.pipeliner,   r-   Zcutadapt.runnersr.   r/   Zcutadapt.filesr0   r1   r2   Zcutadapt.utilsr3   r4   r5   Zcutadapt.logr6   r7   Zcutadapt.qualtrimr8   Z	getLoggerr%  r9   	Exceptionra   r   r"  r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r	  r
  r  r)  r-  stdoutbufferr+  r=  rA  r<  r>  r?  r:  r   rI  rO   r\   rH   rH   rH   rI   <module>   s    ,T# v

) 
   
*1|0v
