U
    \)_f!                     @   s   d Z ddlmZ ddlmZmZmZ ddlmZ	 ddl
mZmZmZ deee edd	d
Zeeeeeeef dddZdeee edddZdeeee dddZdeeeeeeef  dddZdS )a  
Chunked reading of FASTA and FASTQ files

This can be used to very quickly split up the input file into similarly-sized chunks,
without actually parsing the records. The chunks can then be distributed to worker threads
or subprocess and be parsed and processed there.
    )	RawIOBase)OptionalIteratorTuple   )paired_fastq_heads)FileFormatErrorFastaFormatErrorUnknownFileFormatN)bufendreturnc                 C   sx   |  dd|}|dkr|d S | dd dks>| dd dkrBdS t| dkrRdS t| d }tdt| dd	dS )
z
    Search for the end of the last complete FASTA record within buf[:end]

    Return an integer length such that buf[:length] contains the highest
    possible number of complete FASTA records.
       
>r   r      >   #z1FASTA file expected to start with '>', but found Nline)rfindlenchrr	   repr)r   r   posc r   +lib/python3.8/site-packages/dnaio/chunks.py_fasta_head   s     r   )buf1buf2end1end2r   c           	      C   s   |dks|dkrdS |dkr,| dd dksD|dkrP|dd dkrPt ddd| dd|}|dd|}t||}d }}|dkr| d||d }|d||d }|d8 }q~||fS )	z
    Return positions pos1, pos2 where right1 <= end1 and right2 <= end2
    such that buf1[:pos1] and buf2[:pos2] contain the same number of complete FASTA
    records.
    r   )r   r   Nr   r   z%FASTA file expected to start with '>'r   r   )r	   countminfind)	r   r   r   r    Z
n_records1Z
n_records2Z	n_recordsZpos1Zpos2r   r   r   _paired_fasta_heads%   s    0

r$   c                 C   s>   |  dd|}|}t|d d D ]}| dd|}q"|d S )z
    Search for the end of the last complete *two* FASTQ records in buf[:end].

    Two FASTQ records are required to ensure that read pairs in interleaved
    paired-end data are not split.
       
r      r   )r!   ranger   )r   r   Z
linebreaksright_r   r   r   _fastq_headA   s
    r*     @ )fbuffer_sizer   c                 c   sB  t |}| t|dd }|dkr*dS |dks6t|dd dkrLt}n@|dd dksl|dd dkrrt}ntdtt|d  |t	|krt
d| t||d | }||krƐq"|||}||kst|dkrt|d| V  || }|dkst||| |d|< q|dkr>t|d| V  dS )	aE  
    Read chunks of complete FASTA or FASTQ records from a file.
    If the format is detected to be FASTQ, all chunks except possibly the last contain
    an even number of records such that interleaved paired-end reads remain in sync.
    The yielded memoryview objects are only valid for one iteration because the internal
    buffer is re-used in the next iteration.

    Arguments:
        f: File with FASTA or FASTQ reads; must have been opened in binary mode
        buffer_size: Largest allowed chunk size

    Yields:
        memoryview representing the chunk. This becomes invalid on the next iteration.

    Raises:
         ValueError: A FASTQ record was encountered that is larger than *buffer_size*.
         UnknownFileFormat: The file format could not be detected
           (the first byte must be "@", ">" or "#")
    r   r   N   @r   r   zZCannnot determine input file format: First character expected to be '>' or '@', but found z+FASTA/FASTQ record does not fit into buffer)	bytearrayreadinto
memoryviewAssertionErrorr*   r   r
   r   r   r   OverflowError)r,   r-   r   startheadZbufendr   r   r   r   read_chunksR   s4     

r6   )r,   f2r-   r   c                 c   s  |dk rt dt|}t|}| t|dd }|t|dd }|dkrl|dkrltdtdfS |dk|dkkr|dkrdnd}td| dd	d
|d	d d  kr|d	d   krdkrn ntdd	d
|d	d dkrd}t}	n6|d	d dkrd}t}	ntd|d	d  d	d
|t|krX|t|krXt d| | t||d	 | }
|t||d	 | }||
kr||krq|	|||
|\}}||
kst||kst|dks|dks|dkrt|d| t|d| fV  nb|dkr(|dks,td}|
dksD|dkrb|
dkrRdnd}d| d}td| dd	d
|
| }|dkst|||
 |d|< || }|dkst||| |d|< q.|dks|dkrt|d| t|d| fV  d	S )a+  
    Read chunks of paired-end FASTA or FASTQ records from two files.
    A pair of chunks (memoryview objects) is yielded on each iteration,
    and both chunks are guaranteed to have the same number of sequences.
    That is, the paired-end reads will stay in sync.

    The memoryviews are only valid for one iteration because the internal
    buffer is re-used in the next iteration.

    This is similar to `read_chunks`, but for paired-end data.

    Args:
        f: File with R1 reads; must have been opened in binary mode
        f2: File with R2 reads; must have been opened in binary mode
        buffer_size: Largest allowed chunk size

    Yields:
        Pairs of memoryview objects.

    Raises:
         ValueError: A FASTA or FASTQ record was encountered that is larger than *buffer_size*.
       zBuffer size too smallr   r          z)Paired-end reads not in sync: File with Rz$ reads is empty and the other is notNr   r.   zAPaired-end data must be in FASTQ format when using multiple coresZFASTQr   ZFASTAzLFirst character in input file must be '@' (FASTQ) or '>' (FASTA), but found z3FASTA/FASTQ records do not fit into buffer of size  z. File z- ended, but more data found in the other filez!Premature end of paired-end input.)	
ValueErrorr/   r0   r1   r   _paired_fastq_headsr$   r   r2   )r,   r7   r-   r   r   Zstart1Zstart2iZfile_formatZpaired_headsZbufend1Zbufend2r   r    Zextrar   r   r   read_paired_chunks   sv    
2$
 r@   )N)N)r+   )r+   )__doc__ior   typingr   r   r   Z_corer   r>   
exceptionsr   r	   r
   bytesintr   r$   r*   r1   r6   r@   r   r   r   r   <module>   s(      
K 