U
    e*K                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ edddZ dZ!dddddddgZ"edddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4gZ#dZe$ee%e&e$e'eefd7d8d9Z(d[e$ee%e&e$e'eefd7d:d;Z)d\e$ee%e&e$e'e$d=d>d?Z*d@dA Z+dBdC Z,d]dDdEZ-d^dFdGZ.d_dHdIZ/dJdK Z0dLdM Z1dNdO Z2d`dPdQZ3dadRdSZ4dTdU Z5dbdVdWZ6dcdXdYZ7dS )d    N)	HTTPErrorChunkedEncodingErrorConnectionErrorReadTimeout)
ExpatError)parse)	DataFrame)DNAIteratorProteinIterator)DNAProtein)Metadata)OrderedDict)Paralleldelayed)Managerzqiime2-rescriptzb.kaehler@adfa.edu.au)ZtoolZemailK7A`?kingdomphylumclassorderfamilygenusspecies)domainZd__)ZsuperkingdomZsk__)r   Zk__)Z
subkingdomZks__)ZsuperphylumZsp__)r   Zp__)Z	subphylumZps__)ZinfraphylumZpi__)Z
superclassZsc__)r   Zc__)subclassZcs__)Z
infraclassZci__)ZcohortZco__)Z
superorderZso__)r   Zo__)ZsuborderZos__)Z
infraorderZoi__)Z	parvorderZop__)ZsuperfamilyZsf__)r   Zf__)Z	subfamilyZfs__)ZtribeZt__)ZsubtribeZts__)r   Zg__)ZsubgenusZgs__)zspecies groupZss__)zspecies subgroupZsgs__)r   Zs__)Z
subspeciesZssb__)ZformaZfor__T   )queryaccession_idsranksrank_propagationlogging_leveln_jobsreturnc                 C   sr   |d krt }| d kr$|d kr$tdt| |||||d\}}tdd | D }t|dgdj}d|j_||fS )N'Query or accession_ids must be suppliednuccorec                 s   s"   | ]\}}t |d |idV  qdS id)ZmetadataN)r   .0kv r,   Hlib/python3.8/site-packages/rescript-2024.2.0-py3.8.egg/rescript/ncbi.py	<genexpr>V   s     z get_ncbi_data.<locals>.<genexpr>Taxonindex
Feature ID)	_default_ranks
ValueError_get_ncbi_datar	   itemsr   Tr1   namer   r   r   r    r!   r"   seqstaxar,   r,   r-   get_ncbi_dataI   s    
  r<   c                 C   sr   |d krt }| d kr$|d kr$tdt| |||||d\}}tdd | D }t|dgdj}d|j_||fS )Nr$   Zproteinc                 s   s"   | ]\}}t |d |idV  qdS r&   )r   r(   r,   r,   r-   r.   j   s    z(get_ncbi_data_protein.<locals>.<genexpr>r/   r0   r2   )	r3   r4   r5   r
   r6   r   r7   r1   r8   r9   r,   r,   r-   get_ncbi_data_protein]   s    
  r=   r%   )r   r   r   r    r!   r"   dbc                 C   s   t  }| }| r(t| |||t|\}	}
|r| }| rx|	rx||	  }|rt||||t|\}}|	| |
| nt||||t|\}	}
t|
|||||t\}}|D ]
}|	|= q|	|fS N)	r   ZLockget_data_for_query_entrez_delayZget_idskeysget_data_for_accsupdateget_taxonomies)r   r   r   r    r!   r"   r>   Zmanagerrequest_lockr:   taxidsaccsZacc_seqsZ
acc_taxidsr;   Zbad_accsaccr,   r,   r-   r5   r   sV             
       r5   c                 C   sV   t d}t|dsRt d}t tj}|| || | rL|	|  d|_
|S )NZrescriptisSetUpz8%(levelname)s:%(asctime)-15s:%(processName)s:%(message)sT)loggingZ	getLoggerhasattrZ	FormatterZStreamHandlersysstdoutZsetFormatterZ
addHandlerZsetLevelrJ   )r!   loggerZ	formatterZhandlerr,   r,   r-   _get_logger   s    




rP   c                 G   sv  d}d}d}dddddd	g}t tttf}t|D ] }z| | W   S  tk
r }	 zL|	jjdkrn|d
 n,|	jj|kr|dt	|	jj d  n|	|	}
W 5 d }	~	X Y n t
k
r }	 z$t	|	dkr|d n|	|	}
W 5 d }	~	X Y nL |k
r8 }	 z,|dt|	j d t	|	 d  |	}
W 5 d }	~	X Y nX tt|d|  | q0t
dt|
j d t	|
 d S )N
   r   x   i  i  i  i  i  i  zRequest failed with code 400. This could be because all of the requested accession ids are invalid, or it could just be a temporary failure. Retrying.zRequest failed with code z. Retrying.
bad recordzRequest failed. Retrying.zRequest failed with exception
: z

Retrying.   zmMaximum retries (10) exceeded for HTTP request. Persistent trouble downloading from NCBI. Last exception was
)r   r   r   r   ranger   ZresponseZstatus_codedebugstrRuntimeErrortype__name__timesleepmin)Zhttp_requestrO   argsZmax_retriesZbackoff_factorZmax_backoffZstatus_forcelistZexception_forcelistZretryeZlast_exceptionr,   r,   r-   
_robustify   sf       
ra   c                    sT   t |dkstdd| d d|d t| fdd}t|| S )	Nr   zneed at least one idz8https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgir>   ,)r>   r'   c              	      s      d t ztj tddd}W 5   d X |  t	|j
d }d|krt|d tr|d D ]}| q~n|d  d|krtd	t| } |d | d< |d
 | d< | S )Nrequest lock acquiredrequest lock releasedrQ   T)dataparamstimeoutstreamZePostResultZERRORWebEnvzNo data for given idsQueryKey	query_key)acquirerW   r\   r]   releaserequestsZpost_entrez_paramsraise_for_statusr   content
isinstancelistwarningr4   dict)rf   rwebenverrorre   entrez_delayZepostrO   rF   r,   r-   request   s4    

   
z_epost.<locals>.request)lenAssertionErrorjoinrP   ra   )rf   idsrF   r!   rz   r{   r,   ry   r-   _epost   s    r   c                    s"   d fdd}t |t|| S )Nz:https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgic                    sv   t   tj| dd}|  t|jd }d|kr@tdtf ddd|d |d	 d
t	} t
|d }| |fS )NrQ   )rf   rg   ZeSearchResultri   zNo sequences for given queryr%   fastaxmlrj   )r>   rettyperetmoderi   rk   ZCount)r\   r]   rn   getrp   r   rq   r4   ru   ro   int)rf   rv   rw   expected_num_recordsrz   Zesearchr,   r-   r{      s"    
  z_esearch.<locals>.request)ra   rP   )rf   r!   rz   r{   r,   r   r-   _esearch   s    r   c                    s2   d dd< t | fdd}t|S )Nz9https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi  retmaxc               	      s      d t ztj ddd} W 5   d X |   t| j	}t
|  }t
|  }t|t
r|D ](}t|tsdt|  tdq|S t|tr|gS dt|  tdd S )Nrc   rd   rQ   T)rf   rg   rh   zbad record:
rS   )rl   rW   r\   r]   rm   rn   r   rp   r   rq   rs   valuespoprr   ru   rX   rY   )rv   re   recZefetchrz   rO   rf   rF   r,   r-   r{     s*    






z_efetch_5000.<locals>.request)rP   ra   )rf   rF   r!   rz   r{   r,   r   r-   _efetch_5000   s
    r   c                 C   s   t | d d S )Na  This query could result in more than 100 requests to NCBI. If you are not running it on the weekend or between 9 pm and 5 am Eastern Time weekdays, it may result in NCBI blocking your IP address. See https://www.ncbi.nlm.nih.gov/home/about/policies/ for details.)rP   rt   )r!   r,   r,   r-   _large_warning!  s    r   c                 C   s   dt t|  d t t| d }ddg}t }|D ](}|D ]}||kr>|||   q6q>q6tt| | }t|dkr|d7 }n|d7 }|d	|d d d
 7 }|S )NzPartial download. Expected z records, but got .TSeq_accverTaxIdrQ   z*
More than 10 ids were missing. Ten were: z!
The following ids were missing: , z.
)rX   r|   setaddrs   r~   )r   re   rx   Zid_keysZgottenrecordZid_keyZungottenr,   r,   r-   _ungotten_ids)  s(    

r   c           
      C   sv   t || |||}t||||}t|}t| t|krXt| |}	|rNt|	n
||	 |dtt| d  |S )Ngot  records)	r   r   rP   r|   r   rY   rt   inforX   )
Z	ids_chunkrf   rF   r!   raise_on_partialrz   Zchunk_params
data_chunkrO   rx   r,   r,   r-   _get_id_chunk<  s*           


r   c                    st   t }|dtt d  tt|dd}| fddtdtdD }	d	d
 |	D }
|
S )NDownloading r   lokyr"   Zbackendc                 3   s0   | ](}t t||d    V  qdS )r   N)r   r   r)   chunkrz   r   r!   rf   r   rF   r,   r-   r.   T  s     z_get_for_ids.<locals>.<genexpr>r   r   c                 S   s   g | ]}|D ]}|qqS r,   r,   r)   Zchunksr   r,   r,   r-   
<listcomp>X  s       z _get_for_ids.<locals>.<listcomp>)rP   r   rX   r|   rs   r   rV   )rf   r   r!   r"   rF   r   rz   rO   parallelchunkyre   r,   r   r-   _get_for_idsN  s    r   c                 C   sx   t | dkrt| tf |dddt}t|| |||d|}i }i }	|D ]$}
|
d ||
d < |
d |	|
d < qJ||	fS )	NiH r   r   )r>   r   r   TTSeq_sequencer   
TSeq_taxid)r|   r   ru   ro   r   )rH   r!   r"   rF   rz   r>   rf   recordsr:   rG   r   r,   r,   r-   rC   ]  s&       rC   c                 C   s   | |d< t ||||}t|td||  krdt|}|dttd||   d tt| d  t|}|dtt| d  |S )NZretstartr   z	Expected z" sequences in this chunk, but got z4. I do not know why, or which sequences are missing.r   
 sequences)r   r|   r^   rP   warnrX   r   )r   rf   rz   r   rF   r!   r   rO   r,   r,   r-   _get_query_chunkn  s$    
r   c                    sF  t f || dddtt \dkr8t g }t}|dt d  t|dd}| fd	d
tddD }	dd |	D }i }
i }|D ]}z$|d |
|d < |d ||d < W q t	k
r: } zLt|dkr(|d |
|d < |d ||d < |
d|d  d  n W 5 d }~X Y qX q|
|fS )Nyr   )r>   ZtermZ
usehistoryr   i
 r   r   r   r   c                 3   s$   | ]}t t| V  qd S r?   )r   r   r   rz   r   r!   rf   rF   r,   r-   r.     s    z%get_data_for_query.<locals>.<genexpr>r   c                 S   s   g | ]}|D ]}|qqS r,   r,   r   r,   r,   r-   r     s       z&get_data_for_query.<locals>.<listcomp>r   r   r   z'TSeq_accver'ZTSeq_sidzUsing zR as a sequence identifier, because it did not come down with an accession version.)ru   ro   r   r   rP   r   rX   r   rV   KeyErrorrt   )r   r!   r"   rF   rz   r>   r   rO   r   r   r:   rG   r   r`   r,   r   r-   r@     sH       
r@   c                    sZ  t f ddit}ttt|  }t|||||d|}	i }
|	D ]p}z|r>td|d fg  dd t	D  |d d	 D ] }|d
 t	kr|d  |d
 < q|d } d r|
 d d r|t d d d  }n"d|kr|dd\}}| d< | d<  d } D ]$} | d kr0| |<  | }qndd |D  d|krd|d  d< nd|krz|d  d< |d d	 D ]}|d  |d
 < q|d }d|kr
 d r|
 d d r
|t d d d  }n"d|kr
|dd\}}| d< | d<  |
|d < d|krH|d D ]} |
|d d < q0W q@ ttfk
r } zBt|}|dt|j d t| d tj|dd d  W 5 d }~X Y q@X q@g }t }i }|  D ]R\}}||
kr|
|  d fdd|D }|||< n|| || q|rRt|}|dd| d  d| d!  ||fS )"Nr>   taxonomyFZNCBI_DivisionZDivisionc                 s   s   | ]}|d fV  qd S r?   r,   r)   rv   r,   r,   r-   r.     s     z!get_taxonomies.<locals>.<genexpr>Z	LineageExr/   ZRankZScientificNamer    r   r   c                 S   s   i | ]
}|d qS ) r,   r   r,   r,   r-   
<dictcomp>  s      z"get_taxonomies.<locals>.<dictcomp>r   r   r   Z	AkaTaxIdszGot exception
rT   z
from taxon
)indentz
Skipping this taxon.z; c                    s   g | ]}t |  |  qS r,   )_allowed_ranksr   r   r,   r-   r     s     z"get_taxonomies.<locals>.<listcomp>ztThe following accessions were deleted from the sequence database because there was a problem with their taxonomies: r   z.
The problematic taxids were: r   )ru   ro   r   maprX   r   r   r   rD   r   
startswithr|   splitr   	TypeErrorrP   rt   rZ   r[   jsondumpsr6   r~   appendr   )rG   r   r    r!   r"   rF   rz   rf   r   r   r;   r   Zrankr   r   Z
last_labelZakaTaxIdr`   rO   Zmissing_accsZmissing_taxidsZtax_stringsrI   ZtaxidZtsr,   r   r-   rE     s     










rE   )NNNTNr   )NNNTNr   )NNNTNr   r%   )r   )r   )r   )r   )r   r%   )r   r%   )r   )8rM   r\   rK   r   rn   Zrequests.exceptionsr   r   r   r   Zxml.parsers.expatr   Z	xmltodictr   Zpandasr   Zq2_types.feature_datar	   r
   Zskbior   r   Zqiime2r   collectionsr   Zjoblibr   r   Zmultiprocessingr   ru   ro   rA   r3   r   rX   rs   boolr   r<   r=   r5   rP   ra   r   r   r   r   r   r   r   rC   r   r@   rE   r,   r,   r,   r-   <module>	   s         #                                  "'
"

" 
   
   
( 