U
     e                     @   s   d Z ddlmZ ddlmZmZmZ ddlmZ	 ddl
mZmZmZ deee edd	d
Zdeee edddZdeeee dddZdeeeeeeef  dddZdS )a  
Chunked reading of FASTA and FASTQ files

This can be used to very quickly split up the input file into similarly-sized chunks,
without actually parsing the records. The chunks can then be distributed to worker threads
or subprocess and be parsed and processed there.
    )	RawIOBase)OptionalIteratorTuple   )paired_fastq_heads)FileFormatErrorFastaFormatErrorUnknownFileFormatN)bufendreturnc                 C   sx   |  dd|}|dkr|d S | dd dks>| dd dkrBdS t| dkrRdS t| d }tdt| dd	dS )
z
    Search for the end of the last complete FASTA record within buf[:end]

    Return an integer length such that buf[:length] contains the highest
    possible number of complete FASTA records.
    s   
>r   r      >   #z1FASTA file expected to start with '>', but found Nline)rfindlenchrr	   repr)r   r   posc r   +lib/python3.8/site-packages/dnaio/chunks.py_fasta_head   s     r   c                 C   s>   |  dd|}|}t|d d D ]}| dd|}q"|d S )z
    Search for the end of the last complete *two* FASTQ records in buf[:end].

    Two FASTQ records are required to ensure that read pairs in interleaved
    paired-end data are not split.
       
r      r   )countranger   )r   r   Z
linebreaksright_r   r   r   _fastq_head%   s
    r"     @ )fbuffer_sizer   c                 c   sB  t |}| t|dd }|dkr*dS |dks6t|dd dkrLt}n@|dd dksl|dd dkrrt}ntdtt|d  |t	|krt
d| t||d | }||krƐq"|||}||kst|dkrt|d| V  || }|dkst||| |d|< q|dkr>t|d| V  dS )	aE  
    Read chunks of complete FASTA or FASTQ records from a file.
    If the format is detected to be FASTQ, all chunks except possibly the last contain
    an even number of records such that interleaved paired-end reads remain in sync.
    The yielded memoryview objects are only valid for one iteration because the internal
    buffer is re-used in the next iteration.

    Arguments:
        f: File with FASTA or FASTQ reads; must have been opened in binary mode
        buffer_size: Largest allowed chunk size

    Yields:
        memoryview representing the chunk. This becomes invalid on the next iteration.

    Raises:
         ValueError: A FASTQ record was encountered that is larger than *buffer_size*.
         UnknownFileFormat: The file format could not be detected
           (the first byte must be "@", ">" or "#")
    r   r   N   @r   r   zZCannnot determine input file format: First character expected to be '>' or '@', but found z+FASTA/FASTQ record does not fit into buffer)	bytearrayreadinto
memoryviewAssertionErrorr"   r   r
   r   r   r   OverflowError)r$   r%   r   startheadZbufendr   r   r   r   read_chunks6   s4     

r.   )r$   f2r%   r   c                 c   sN  |dk rt dt|}t|}| t|dd }|t|dd }|dkrd|dd dks||dkr|dd dkrtddd|t|kr|t|krt d	| | t||d | }|t||d | }||kr||krqt||||\}	}
|	|kst|
|ks$t|	dks8|
dkr\t|d|	 t|d|
 fV  nb|	dkrp|
dksttd
}|dks|dkr|dkrdnd}d| d}td| ddd||	 }|dkst||	| |d|< ||
 }|dkst||
| |d|< q|dks(|dkrJt|d| t|d| fV  dS )aR  
    Read chunks of paired-end FASTQ reads from two files.
    A pair of chunks (memoryview objects) is yielded on each iteration,
    and both chunks are guaranteed to have the same number of sequences.
    That is, the paired-end reads will stay in sync.

    The memoryviews are only valid for one iteration because the internal
    buffer is re-used in the next iteration.

    This is similar to `read_chunks`, but for paired-end data.
    Unlike `read_chunks`, this only works for FASTQ input.

    Args:
        f: File with R1 reads; must have been opened in binary mode
        f2: File with R2 reads; must have been opened in binary mode
        buffer_size: Largest allowed chunk size

    Yields:
        Pairs of memoryview objects.

    Raises:
         ValueError: A FASTQ record was encountered that is larger than *buffer_size*.
       zBuffer size too smallr   r   r&   zAPaired-end data must be in FASTQ format when using multiple coresNr   z-FASTQ records do not fit into buffer of size     z. File z- ended, but more data found in the other filez#Premature end of paired FASTQ input.)
ValueErrorr'   r(   r)   r   r   _paired_fastq_headsr*   )r$   r/   r%   Zbuf1Zbuf2Zstart1Zstart2Zbufend1Zbufend2Zend1Zend2Zextrair   r   r   read_paired_chunks~   sR    0$
 r7   )N)N)r#   )r#   )__doc__ior   typingr   r   r   Z_corer   r5   
exceptionsr   r	   r
   bytesintr   r"   r)   r.   r7   r   r   r   r   <module>   s   K 