
    瞤dۀ                        d dl mZ d dl mZ d dlZd dlZ	 d dlZn# e$ r d dlZY nw xY wd dlm	Z	m
Z
 d dlmZ d dlZd dlZd dlZd dlZd dlmZ d dlZddgZd	Zej                            ej                            d
d          dd          ZefdZ G d de          Zd Zd ZddZ d Z!e"dk    r e            Z#e#$                    d          Z% e&e%            e&e#'                    e%                      e&e#(                    e%                     e#$                    ddd          Z) e&e#*                    e)                      e&e#'                    e)                     dS dS )    )absolute_import)print_functionN)defaultdictCounter)md5)mapNCBITaxais_taxadb_up_to_date   HOME/z.etetoolkitztaxa.sqlitec                 &   t          j        |           }	 |                    d          }|                                d         }n*# t           j        t
          t          t          f$ r d}Y nw xY w|                                 |t          k    rdS dS )z|Check if a valid and up-to-date taxa.sqlite database exists

    If dbfile= is not specified, DEFAULT_TAXADB is assumed
    zSELECT version FROM stats;r   NFT)
sqlite3connectexecutefetchoneOperationalError
ValueError
IndexError	TypeErrorclose
DB_VERSION)dbfiledbrversions       <lib/python3.11/site-packages/ete3/ncbi_taxonomy/ncbiquery.pyr
   r
   H   s    
 
	 	 BJJ344**,,q/$j*iH    HHJJJ*u4s   /A $A-,A-c                       e Zd ZdZddZddZd Zd Zdd	Zd
 Z	d Z
d Zd ZddZd Zd ZddZddZddZd ZddZdS )r	   zf
    versionadded: 2.3

    Provides a local transparent connector to the NCBI taxonomy database.
    NTc                 ^   |st           | _        n|| _        |r|                     |           |t           k    rTt          j                            | j                  s0t          dt          j                   |                     |           t          j                            | j                  st          d| j        z            d | _
        |                                  t          | j                  s4|r4t          dt          j                   |                     |           d S d S d S )Nz0NCBI database not present yet (first time used?)filez!Cannot open taxonomy database: %sz+NCBI database format is outdated. Upgrading)DEFAULT_TAXADBr   update_taxonomy_databaseospathexistsprintsysstderrr   r   _connectr
   )selfr   taxdump_fileupdates       r   __init__zNCBITaxa.__init__d   s     	!(DKK DK 	8)),777^##BGNN4;,G,G#D3:VVVV)),777w~~dk** 	P@4;NOOO#DK00 	8V 	8?cjQQQQ)),77777	8 	8 	8 	8    c                 `    |st          | j                   dS t          | j        |           dS )zUpdates the ncbi taxonomy database by downloading and parsing the latest
        taxdump.tar.gz file from the NCBI FTP site (via HTTP).

        :param None taxdump_file: an alternative location of the taxdump.tax.gz file.
        N)	update_dbr   )r+   r,   s     r   r#   z!NCBITaxa.update_taxonomy_database|   s:      	1dk"""""dk<00000r/   c                 B    t          j        | j                  | _        d S N)r   r   r   r   )r+   s    r   r*   zNCBITaxa._connect   s    /$+..r/   c                    t          t          t          t          |                              }dd                    t          t
          |                    z  }| j                            |          }i }|                                D ]h\  }}|	                    t          |                     |
                    t          |                     t          |          |t          |          <   i||fS )Nz?select taxid_old, taxid_new FROM merged WHERE taxid_old IN (%s),)setlistr   intjoinstrr   r   fetchalldiscardadd)r+   
all_taxidsconv_all_taxidscmdresult
conversionoldnews           r   _translate_mergedzNCBITaxa._translate_merged   s    tCZ$8$899;;OQTQYQYZ]^acmZnZnQoQoo%%
)) 	, 	,HC##CHH---C)))#&s88Js3xx  
**r/   ?c                    ddl m} |                    | j                  }|                    d           t
          j                            t
          j                            t                              d         }|
                    dt
          j                            |d          z             t          d|z             t          j        t          |          d|z
  z            }d|d	|d
}ddt          |          }
}	}|
                    |          }	 |                                \  }}	}
t#          |          }n^# t$          $ rQ d|d|d
}|
                    |          }	 |                                \  }}	}
t#          |          }n#  Y nxY wY nw xY wdt'          |
          t          |          z  z
  }|rt          d|	d|d|
d|d	           ||	|fS )z
        Given an inexact species name, returns the best match in the NCBI database of taxa names.

        :argument 0.9 sim: Min word similarity to report a match (from 0 to 1).

        :return: taxid, species-name-match, match-score
        r   NTzselect load_extension('%s')z%SQLite-Levenshtein/levenshtein.sqlextzTrying fuzzy search for %s   z+SELECT taxid, spname, LEVENSHTEIN(spname, "z#") AS sim  FROM species WHERE sim<=z ORDER BY sim LIMIT 1;z#") AS sim  FROM synonym WHERE sim<=z
FOUND!    z taxid:z score:z ())sqlite3.dbapi2dbapi2r   r   enable_load_extensionr$   r%   splitrealpath__file__r   r9   r'   mathceillenr   r8   r   float)r+   namesimrK   _dbmodule_pathmaxdiffsr@   taxidspnamescorerA   
norm_scores                r   get_fuzzy_name_translationz#NCBITaxa.get_fuzzy_name_translation   sT    	('''''nnT[))!!$'''gmmBG$4$4X$>$>??B1BGLLAh5j 5j j 	k 	k 	k 	*T12229SYY!C%011 }A  }A  }A  CK  CK  CK  L#T3t99uvS!!	#)??#4#4 E65 JJEE  	# 	# 	# 	# AE  AE  AE  GO  GO  GO  PC[[%%F#'-'8'8$vu E

	# %,,s4yy01
 	]E&&&%%%PZPZPZ[\\\fj((s*   E (F!/FF!FF! F!c                 6   t          |          }|                    d           |                    d           d                    d |D                       }d|z  }| j                            |          }i }|                                D ]
\  }}|||<   |S )z[return a dictionary converting a list of taxids into their corresponding NCBI taxonomy rankN r5   c                     g | ]}d |z  S z"%s" .0vs     r   
<listcomp>z%NCBITaxa.get_rank.<locals>.<listcomp>       555&!)555r/   z4select taxid, rank FROM species WHERE taxid IN (%s);)r6   r<   r9   r   r   r;   )	r+   taxidsall_idsqueryr@   rA   id2ranktaxrZ   s	            r   get_rankzNCBITaxa.get_rank   s     f++55W55566DeK%%!??,, 	" 	"KC!GCLLr/   c                    t          |          }|                    d           |                    d           d                    d |D                       }| j                            d|z            }i }|                                D ]J\  }}t          t          t          t          |
                    d                                        ||<   K|S )Given a valid taxid number, return its corresponding lineage track as a
        hierarchically sorted list of parent taxids.
        Nr_   r5   c                     g | ]}d |z  S ra   rb   rc   s     r   rf   z3NCBITaxa.get_lineage_translator.<locals>.<listcomp>   rg   r/   z5SELECT taxid, track FROM species WHERE taxid IN (%s);)r6   r<   r9   r   r   r;   r7   r   r8   reversedrM   )r+   rh   ri   rj   rA   id2lineagesrl   tracks           r   get_lineage_translatorzNCBITaxa.get_lineage_translator   s     f++55W55566!XZ_!_`` //++ 	J 	JJC#CXekk#6F6F-G-G$H$HIIKr/   c                 L   |sdS t          |          }| j                            d|z            }|                                }|s|                     |g          \  }}||v r7| j                            d||         z            }|                                }|st          d|z            t          j        d|d||                    t          t          t           |d         
                    d                              }t          t          |                    S )ro   Nz(SELECT track FROM species WHERE taxid=%sz%s taxid not foundztaxid z was translated into r   r5   )r8   r   r   r   rE   r   warningswarnr7   r   rM   rq   )r+   rY   rA   	raw_track_merged_conversionrs   s          r   get_lineagezNCBITaxa.get_lineage   s"     	4E

!KU!RSSOO%%	 	d#'#9#95'#B#B A ))))SUfglUm)mnn"OO--	 d !5u!<===%%%IZ[`IaIabcccSil00556677HUOO$$$r/   c                     d                     d |D                       }d|z  }| j                            |          }i }|                                D ]\  }}|r|||<   |S )Nr5   c                     g | ]}d |z  S ra   rb   rc   s     r   rf   z-NCBITaxa.get_common_names.<locals>.<listcomp>   s    444&!)444r/   z6select taxid, common FROM species WHERE taxid IN (%s);)r9   r   r   r;   )r+   rh   rj   r@   rA   id2namerl   common_names           r   get_common_nameszNCBITaxa.get_common_names   sy    44V44455FM%% & 1 1 	+ 	+C +*r/   c                    t          t          t          |                    }|                    d           |                    d           d                    d |D                       }d|z  }| j                            |          }i }|                                D ]
\  }}	|	||<   t          |          t          |          k    r|r|t          |	                                          z
  }
| 
                    |
          \  }}d t          j        |          D             }|rcd                    d |D                       }d|z  }| j                            |          }|                                D ]\  }}	|	|||         <   |S )zhGiven a list of taxids, returns a dictionary with their corresponding
        scientific names.
        Nr_   r5   c                     g | ]}d |z  S ra   rb   rc   s     r   rf   z1NCBITaxa.get_taxid_translator.<locals>.<listcomp>
  rg   r/   z6select taxid, spname FROM species WHERE taxid IN (%s);c                     i | ]\  }}||	S rb   rb   )rd   kre   s      r   
<dictcomp>z1NCBITaxa.get_taxid_translator.<locals>.<dictcomp>  s    >>>!q!>>>r/   c                     g | ]}d |z  S ra   rb   rc   s     r   rf   z1NCBITaxa.get_taxid_translator.<locals>.<listcomp>  s    !=!=!=&!)!=!=!=r/   )r6   r   r8   r<   r9   r   r   r;   rR   keysrE   six	iteritems)r+   rh   try_synonymsri   rj   r@   rA   r~   rl   rZ   not_found_taxidsold2newnew2olds                r   get_taxid_translatorzNCBITaxa.get_taxid_translator  s   
 c#v&&''55W55566FM%%!??,, 	" 	"KC!GCLL w<<3w<<''L'&W\\^^)<)<<"445EFFOFG>>s}W'='=>>>G 3!=!=W!=!=!=>>NPUU--#)??#4#4 3 3KC,2GGCL))r/   c                 z   i }i }|D ]}|||                                 <   t          |                                          }d                    d t	          j        |          D                       }d|z  }| j                            d|z            }|                                D ]H\  }}	||                                          }
|	                    |
g           
                    |	           I|t          d |                                D                       z
  }|rd                    d |D                       }| j                            d|z            }|                                D ]H\  }}	||                                          }
|	                    |
g           
                    |	           I|S )z
        Given a list of taxid scientific names, returns a dictionary translating them into their corresponding taxids.

        Exact name match is required for translation.
        r5   c                     g | ]}d |z  S ra   rb   rd   ns     r   rf   z0NCBITaxa.get_name_translator.<locals>.<listcomp>/  s    III&!)IIIr/   z6select spname, taxid from species where spname IN (%s)c                 6    g | ]}|                                 S rb   )lowerr   s     r   rf   z0NCBITaxa.get_name_translator.<locals>.<listcomp>6  s     BBBa		BBBr/   c                     g | ]}d |z  S ra   rb   r   s     r   rf   z0NCBITaxa.get_name_translator.<locals>.<listcomp>8  s    999Afai999r/   z6select spname, taxid from synonym where spname IN (%s))r   r6   r   r9   r   iterkeysr   r   r;   
setdefaultappend)r+   namesname2idname2orignamer   rj   r@   rA   sprY   onamemissings               r   get_name_translatorzNCBITaxa.get_name_translator   s     	) 	)A'(M!''))$$M&&(())IIS\--H-HIIIJJFM!Y[`!`aa** 	8 	8IB!"((**-Eub))0077773BB7<<>>BBBCCC 	<HH99999::EW__%]_d%deeF#__.. < <	E%bhhjj1""5"--44U;;;;r/   c                     |                      |          }g }|D ]+}|                    |                    ||                     ,|S )zp
        Given a list of taxid numbers, returns another list with their corresponding scientific names.
        )r   r   get)r+   rh   r~   r   r   s        r   translate_to_nameszNCBITaxa.translate_to_names@  sS     ++F33 	. 	.BLLR,,----r/   Fc                 ,   	 t          |          }nT# t          $ rG 	 |                     |g          |         d         }n # t          $ r t          d|z            w xY wY nw xY w|                     |g          \  }}|r||         }t          | j        dz   d          5 }	t          j        |	          }
ddd           n# 1 swxY w Y   i }d}|
D ]9}||k    r|dz  }|dk    r|	                    |d          dz   ||<   1|dk    r n:|st          d|z            |dk    r|gS |s|s|r| 
                    t          |                                          |||	          }|r|S |r>t          t          t           d
 |                                D                                 S t          t          t           d |D                                 S |rd t          j        |          D             S d t          j        |          D             S )z
        given a parent taxid or scientific species name, returns a list of all its descendants taxids.
        If intermediate_nodes is set to True, internal nodes will also be dumped.

        r   z%s not found!.traverse.pklrbNrH   r   ztaxid not found:%s)intermediate_nodescollapse_subspecies
rank_limitc                     g | ]	}|j         
S rb   rT   r   s     r   rf   z0NCBITaxa.get_descendant_taxa.<locals>.<listcomp>t  s    %M%M%Maf%M%M%Mr/   c                     g | ]	}|j         
S rb   r   r   s     r   rf   z0NCBITaxa.get_descendant_taxa.<locals>.<listcomp>v  s    %;%;%;af%;%;%;r/   c                     g | ]\  }}|S rb   rb   rd   tidcounts      r   rf   z0NCBITaxa.get_descendant_taxa.<locals>.<listcomp>y  s    EEEJCCEEEr/   c                 $    g | ]\  }}|d k    |S rH   rb   r   s      r   rf   z0NCBITaxa.get_descendant_taxa.<locals>.<listcomp>{  s"    SSSJCQR

C


r/   )r8   r   r   KeyErrorrE   openr   pickleloadr   get_topologyr7   r   r   get_descendantsr   r   )r+   parentr   r   r   return_treerY   ry   rB   CACHED_TRAVERSEprepostorderdescendantsfoundr   trees                  r   get_descendant_taxazNCBITaxa.get_descendant_taxaK  s   	:KKEE 	: 	: 	::00&::6B1E : : : &!8999: 	: ..w77: 	&u%E$+o-t44 	8!;77L	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	 	Ce||
!#.??3#:#:Q#>C  !   	158999aZZ7N 	T, 	T 	T$$T+*:*:*<*<%=%=Rd  {N  [e$  f  fD ># >C%M%Md6J6J6L6L%M%M%MNNOOOC%;%;d%;%;%;<<=== 	TEE#-*D*DEEEESS#-*D*DSSSSs7    
A#"A A# AA#"A#!CC	Cc                    ddl m} |                     |          \  }}t          |          dk    rt	          t          |          d                   }t          | j        dz   d          5 }	t          j	        |	          }
ddd           n# 1 swxY w Y   i }d}i }d}t                      }|
                    |          }	 |
                    ||dz             }|
||dz            }n# t          $ r |g}Y nw xY wt          d t          |                                          D                       } |t          |          	          ||<   ||         }|D ]l}||v r||         j        }|                    |            |t          |          	          ||<   |                    ||                    ||vr||         }m||         }nt          t'          t          |                    }i }i }|                     |          }t                      }|                                D ]}|                    |           |                     |          }|D ]}g }||         }|D ]}||vr~|                    | |                      } t          |          | _        || _        |                     d
t          |                    t	          |          d                               n||         } |                    |            |||<   t=          j        |          D ]=\  }}d}!|D ]3}|!r||!j         vr|!                    |           |r|j!        |k    r n|}!4>|d         }|s[|"                                D ]F}"t          |"j                   dk    r,t	          |"j                  |vr|"#                    d           Gt          |j                   dk    r |j         d         $                                }#n|}#|rWg }$|#%                                D ]'} | j!        dk    r|$&                    | j                    (|$D ]}"|"$                                 |r| '                    |#           |#S )a  Given a list of taxid numbers, return the minimal pruned NCBI taxonomy tree
        containing all of them.

        :param False intermediate_nodes: If True, single child nodes
            representing the complete lineage of leaf nodes are kept.
            Otherwise, the tree is pruned to contain the first common
            ancestor of each group.

        :param None rank_limit: If valid NCBI rank name is provided,
            the tree is pruned at that given level. For instance, use
            rank="species" to get rid of sub-species or strain leaf
            nodes.

        :param False collapse_subspecies: If True, any item under the
            species rank will be collapsed into the species upper
            node.

        r   )	PhyloTreerH   r   r   r   Nc                 $    g | ]\  }}|d k    |S r   rb   )rd   re   r   s      r   rf   z)NCBITaxa.get_topology.<locals>.<listcomp>  s"    SSS5QR

!


r/   r   rankzno rankF)prevent_nondicotomicspecies)(r_   r   rE   rR   r8   r7   r   r   r   r   r6   indexr   r   itemsr:   upr=   	add_childr   rt   valuesr-   rm   r   rT   rY   add_featurer   r   r   r   childrenr   r   deletedetachtraverseextendannotate_tree)%r+   rh   r   r   r   annotater   rz   
root_taxidr   r   r   r   nodeshitvisitedstartendsubtreeleavescurrent_parentr   rootsp2track	elem2node
id2lineager>   lineagerk   r   rs   elemnoder   r   r   	to_detachs%                                        r   r   zNCBITaxa.get_topology}  s   & 	!     $($:$:6$B$B!!v;;!T&\\!_--Jdk/1488 <O%{?;;< < < < < < < < < < < < < < <KEECeeG &&z22E'"((U1W==&uSU{3 ' ' '%,' SSGG,<,<,B,B,D,DSSSTTF )	s: ? ? ?E*":.N 4 4'>>%*3Z]NNKK$$$!*C!9!9!9E#J",,U3Z888&(().s$DDS&))**FHI44V<<JJ%,,.. + +!!'****mmJ//G % %$R.# ' 'D9,,(33D))++FF$'II	%)
((W[[TI5V5V1W1WXXXX(LL&&&&$ ]844 " "	E! " "D /$fo"="=((...! di:&=&=!FFQ<D " 	9))++ 9 9qz??a''CKKv,E,EHH%H888t}""=#**,,DDD 	I 4 49	))$$T]333  



 	%t$$$s$   -BBB&C, ,C<;C<rT   c                 h   t                      }|                                D ]L}	 t          t          ||                    }|                    |           6# t
          t          f$ r Y Iw xY wi }	|                     |          \  }}	rD|t          t          t          t          
                                                              z
  r|                     |          |rD|t          t          t          t          |
                                                              z
  r|                     |          }t          d t          |                                          D                       }
|                     t          |
t          
                                          z
                      }                    |           |                     
                                          }|s4|                     t          
                                                    }|                                }|                    d          D ]}	 t          t          ||                    }n# t
          t          f$ r d}Y nw xY w|                    |           |r||	v r|	|         }|                                        |t          ||d                    |                    |d          |                    |g           |                    |d          fd|                    |g           D                        |                                r+|                    t          ||d	          dg dg            8|                     d
 ||         D                       }|d         }|                                        |t-          |                    |                    |d          |||                    |d          fd|D                        ׉||fS )a]  Annotate a tree containing taxids as leaf names by adding the  'taxid',
        'sci_name', 'lineage', 'named_lineage' and 'rank' additional attributes.

        :param t: a Tree (or Tree derived) instance.

        :param name taxid_attr: Allows to set a custom node attribute
            containing the taxid number associated to each node (i.e.
            species in PhyloTree instances).

        :param tax2name,tax2track,tax2rank: Use these arguments to
            provide pre-calculated dictionaries providing translation
            from taxid number and names,track lineages and ranks.
        c                     g | ]	}|D ]}|
S rb   rb   )rd   _lin_taxs      r   rf   z*NCBITaxa.annotate_tree.<locals>.<listcomp>
  s&    ZZZUYZZTtZZZZr/   	postorderN)rY   r_   Unknownc                 V    g | ]%}                     |t          |                    &S rb   r   r:   rd   rl   tax2names     r   rf   z*NCBITaxa.annotate_tree.<locals>.<listcomp>#  s.    /u/u/uPSS#c((0K0K/u/u/ur/   )sci_namer   r   r   named_lineageNAc                     g | ]	}|j         
S rb   )r   )rd   lfs     r   rf   z*NCBITaxa.annotate_tree.<locals>.<listcomp>+  s    /Q/Q/Qr
/Q/Q/Qr/   c                 V    g | ]%}                     |t          |                    &S rb   r   r   s     r   rf   z*NCBITaxa.annotate_tree.<locals>.<listcomp>2  s.    /_/_/_PSS#c((0K0K/_/_/_r/   )r   r   rY   r   r   r   )r6   r   r8   getattrr=   r   AttributeErrorrE   r   r7   r   r   rt   r   r-   r   rm   get_cached_contentadd_featuresr   is_leaf_common_lineager:   )r+   t
taxid_attrr   	tax2tracktax2rankrh   r   r   rz   all_taxid_codesextra_tax2nametax2common_namen2leaves
node_taxidr   ancestors      `             r   r   zNCBITaxa.annotate_tree  s9     	  	 A '!Z0011 

3 ~.    $($:$:6$B$B!! 	96CChmmoo1F1F(G(G$H$HH 	90088H 	<FSS$y~~7G7G2H2H)I)I%J%JJ 	<33F;;IZZY5E5E5G5G0H0HZZZ[[224#hmmooJ^J^8^3_3_``'''//@@ 	<}}T(--//%:%:;;H''))K(( 	a 	aA" J!7!788

/ " " "!


" NN:N... a!222!2:!>J(,,z71jZ\C]C]*^*^-<-@-@R-P-P)2z2)F)F&.ll:y&I&I/u/u/u/uW`WdWdeoqsWtWt/u/u/u	  w w w w
  a'!Z*F*F-/)+&//1	  3 3 3 3 ../Q/QXa[/Q/Q/QRR"2;(,,xX*O*O-<-@-@2-N-N'/)0&.ll8Y&G&G/_/_/_/_W^/_/_/_  a a a a H,,s#   AA.-A.I::JJc                 V   t          t                    }t          t                    D ]B}t          |          D ]0\  }}||xx         dz  cc<   |                             |           1Cfdt          j        |          D             }|sdgS t          |fd          }|S )NrH   c                 @    g | ]\  }}|t                    k    |S rb   rR   )rd   rY   ocuvectorss      r   rf   z,NCBITaxa._common_lineage.<locals>.<listcomp>>  s.    [[[JE3scRYllGZGZ%GZGZGZr/   r_   c                 .    t          |                    S r3   )min)xposs    r   <lambda>z*NCBITaxa._common_lineage.<locals>.<lambda>B  s    #c!f++ r/   )key)r   r8   r6   	enumerater=   r   r   sorted)	r+   r	  
occurrencere   irY   commonsorted_lineager  s	    `      @r   r   zNCBITaxa._common_lineage6  s     %%
# 	" 	"A%aLL " "55!!!Q&!!!E
q!!!!" \[[[#-
*C*C[[[ 	"4K#F0E0E0E0EFFFN!!r/   c                 2   |s|                                 }t          t                    t                      }|                                D ]r}|j                                        dk    r>||j                 }t          |          D ] \  }}|                             |           !]|                    |           st          t                    }	t                      }
t          j
                  D ]\  }}t          |          dk    r|                    |          }nt          |          d         }|t          ||                   z  |z
  r0|	|                             |           |
                    |           fd|
D             }|	|
|fS )zReturns a list of NCBI lineage names that are not monophyletic in the
        provided tree, as well as the list of affected branches and their size.

        CURRENTLY EXPERIMENTAL

        unknownrH   r   c                 :    g | ]}t          |                   S rb   r  )rd   rl   tax2nodes     r   rf   z0NCBITaxa.get_broken_branches.<locals>.<listcomp>  s%    JJJSc(3-00JJJr/   )r   r   r6   iter_leavesr   r   rY   r  r=   r   r   rR   get_common_ancestorr7   )r+   r   taxa_lineages	n2contentr  leafr   r   rl   broken_branchesbroken_cladesr   r  broken_clade_sizesr  s                 @r   get_broken_brancheszNCBITaxa.get_broken_branchesd  s     	/,,..Is##%%MMOO 	" 	"D}""$$	11'
3"+G"4"4 , ,JE3SM%%d++++, D!!!!%c**=22 	' 	'KC6{{Q..v66faYv.///7: ''++C000!!#&&&JJJJMJJJ/AAAr/   )NNTr3   )rF   )T)FNFF)FNFT)rT   NNN)__name__
__module____qualname____doc__r.   r#   r*   rE   r]   rm   rt   r{   r   r   r   r   r   r   r   r   r"  rb   r/   r   r	   r	   ]   s]        8 8 8 80	1 	1 	1 	1/ / /
+ 
+ 
+() () () ()T     % % %2     <  @  0T 0T 0T 0Tdj j j jZJ- J- J- J-X" " "\!B !B !B !B !B !Br/   c                    ddl m} i }i }i }t                      }i }i }t          d           t                      }|                     d          D ]}	t          |	                                          }	d |	                    d          D             }
|
d         }|
d                                         }|
d	         }|	                    d
          
                    d
          }|dk    r|||<   |dk    r|||<   |t          g d          v rF||                                f}||vr,|                    |           |                    ||f           t          t          |          d           t          t          |          d           t          d           |                     d          D ]}	t          |	                                          }	|	                    d          }
|
d                                         }|
d	                                         } |            }||_        ||         |_        ||v r||         |_        |
d                                         |_        |||<   |||<   t          t          |          d           t          d           |D ]<}|dk    r	||         }||         }||         }|                    ||                    =t          d           ||fS )Nr   )TreezLoading node names...z	names.dmpc                 6    g | ]}|                                 S rb   striprd   _fs     r   rf   z,load_ncbi_tree_from_dump.<locals>.<listcomp>  s     888"288::888r/   |r      rH   "zscientific namezgenbank common name)synonymzequivalent namezgenbank equivalent nameanamorphzgenbank synonymzgenbank anamorph
teleomorphznames loaded.zsynonyms loaded.zLoading nodes...z	nodes.dmpznodes loaded.zLinking nodes...1zTree is loaded.)r_   r(  r6   r'   extractfiler:   decoderM   r   rstriplstripr=   rR   r+  rT   taxnamer   r   r   )tarr(  parent2child	name2nodenode2taxnamesynonyms	name2ranknode2commonunique_nocase_synonymslinefieldsnodename	name_typer9  synonym_key
parentnamer   r   r   r   parent_nodes                        r   load_ncbi_tree_from_dumprI    s    LILuuHIK	
!""" UU,, 2 24;;==!!88

3888!91IOO%%	) ..%%,,S11)))%,L"---$+K!!# ` ` ` a a a a $W]]__5K"888&**;777h0111	#l

_---	#h--+,,,	
,,    4;;==!!**S//!9??$$AY__&&
DFF *	{""'1AM""!+X	(	#i../***	
 2 2
#++11 &6"6*;  41111	
h;r/   c                    t          dd          }t          |                                           D ]\  }}|dz  dk    rt          d|dd           |}g }|r#|                    |j                   |j        }|#|j        rft          d	                    |j        |j        j        |j        t          |d
d          |j
        d                    |          g          |           t          d	                    |j        d|j        t          |d
d          |j
        d                    |          g          |           |                                 d S )Ntaxa.tabwi  r   zgenerating entries... )r   	r   r_   r5   r    )r   r  r   r'   r   rT   r   r9   r9  r   r   r   )r   OUTjr   	temp_noders   s         r   generate_tablerS    s]   
z3

C!**,,'' x x1T6Q;;$q0c::::	 	%LL(((!I  	% 4 	x$))QVQTY	71mUW;X;XZ[Z`bebjbjkpbqbqrssz}~~~~~$))QVRGA}b4Q4QSTSY[^[c[cdi[j[jkllsvwwwwwIIKKKKKr/   c                    t           j                            |           d         }|r3t           j                            |          st          j        |           |s	 ddlm} n# t          $ r	 ddlm} Y nw xY w |d          \  }}t          |d          5 }|
                                                                d         }d d d            n# 1 swxY w Y   d}d}t           j                            d          rt          t          dd                                                                                    }	|	|k    rEd}t          d	t          j        
            |d|           t          dt          j        
           n`t          dt          j        
           nDd}t          dt          j        
            |d|           t          dt          j        
           t#          j	        |d          }
t%          |
          \  }}d |                                D             }t)          j        |t          | dz   d          d           t          d| z             t-          |           t          dd          5 }|                    d                    d |D                                  d d d            n# 1 swxY w Y   t          dd          5 }|
                    d          D ]u}t5          |                                          }d                    d |                    d          d d         D                       }|                    |dz              v	 d d d            n# 1 swxY w Y   	 t9          |            t          j        d           |st          j        d           d S d S #   xY w)Nr   )urlretrievez8https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz.md5r   ztaxdump.tar.gzFr   Tz8Updating taxdump.tar.gz from NCBI FTP site (via HTTP)...r    z3http://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gzzDone. Parsing...z%Local taxdump.tar.gz seems up-to-datez;Downloading taxdump.tar.gz from NCBI FTP site (via HTTP)...c                 <    g | ]\  }}t          |j                  S rb   )r8   rT   )rd   postr   s      r   rf   zupdate_db.<locals>.<listcomp>  s$    KKKztTC	NNKKKr/   r   wbr   zUpdating database: %s ...syn.tabrL  
c                 4    g | ]}|d          d|d         S )r   rO  rH   rb   rc   s     r   rf   zupdate_db.<locals>.<listcomp>  s+    EEEq!QqTT2EEEr/   
merged.tabz
merged.dmprO  c                 6    g | ]}|                                 S rb   r*  r,  s     r   rf   zupdate_db.<locals>.<listcomp>  s     !K!K!K"((**!K!K!Kr/   r.  zrm syn.tab merged.tab taxa.tabzrm taxdump.tar.gz)r$   r%   rM   r&   mkdirurllibrU  ImportErrorurllib.requestr   readliner   read	hexdigestr'   r(   r)   tarfilerI  iter_prepostorderr   dumprS  writer9   r5  r:   r6  upload_datasystem)r   
targz_filebasepathrU  md5_filenamery   md5_file	md5_checkdo_download	local_md5r:  r   r>  r   SYNmergedrB  out_lines                     r   r1   r1     s   w}}V$$Q'H x00 
 7	3******* 	3 	3 	322222222	3 (K(bccq,$$ 	7 ))++1133A6I	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7%
7>>*++ 	7D!1488==??@@JJLLII%%"PWZWabbbbQS]^^^(sz:::::=CJOOOOOKOVYV`aaaaKMzZZZ$3:6666
,z3
'
'C*3//KAxKKQ5H5H5J5JKKKL
Kd6/#94@@!DDD	
%v
-...1	i		 H		$))EEHEEEFFGGGH H H H H H H H H H H H H H H 
lC	 	  (FOOL11 	( 	(Dt{{}}%%Dyy!K!Ktzz#rr7J!K!K!KLLHLL$''''	(( ( ( ( ( ( ( ( ( ( ( ( ( ( (
+F 		2333 	+I)*****	+ 	+sU   A& &A98A9-CCC
3K		KK$BM==NN	O O	c           
         t                       t          d|            t          j                            |           d         }|r3t          j                            |          st          j        |           t          j        |           }d}|                    d          D ]}|                    |           t                       |                    dt          z             |
                                 t          t          d                    D ]\  }}|dz  dk    r=t          d|z  d	t          j        
           t          j                                         |                    d                              d          \  }}|                    d||f           t                       |
                                 t          t          d                    D ]\  }}|dz  dk    r=t          d|z  d	t          j        
           t          j                                         |                    d                              d          \  }	}
|                    d|	|
f           t                       |
                                 t          t          d                    D ]\  }}|dz  dk    r=t          d|z  d	t          j        
           t          j                                         |                    d                              d          \  }}}}}}|                    d||||||f           t                       |
                                 d S )NzUploading tor   au  
    DROP TABLE IF EXISTS stats;
    DROP TABLE IF EXISTS species;
    DROP TABLE IF EXISTS synonym;
    DROP TABLE IF EXISTS merged;
    CREATE TABLE stats (version INT PRIMARY KEY);
    CREATE TABLE species (taxid INT PRIMARY KEY, parent INT, spname VARCHAR(50) COLLATE NOCASE, common VARCHAR(50) COLLATE NOCASE, rank VARCHAR(50), track TEXT);
    CREATE TABLE synonym (taxid INT,spname VARCHAR(50) COLLATE NOCASE, PRIMARY KEY (spname, taxid));
    CREATE TABLE merged (taxid_old INT, taxid_new INT);
    CREATE INDEX spname1 ON species (spname COLLATE NOCASE);
    CREATE INDEX spname2 ON synonym (spname COLLATE NOCASE);
    ;z(INSERT INTO stats (version) VALUES (%d);rY  i  zInserting synonyms:     % 6drN  )r   r!   rZ  rO  z2INSERT INTO synonym (taxid, spname) VALUES (?, ?);r\  zInserting taxid merges: % 6dz8INSERT INTO merged (taxid_old, taxid_new) VALUES (?, ?);rK  zInserting taxids:      % 6dz[INSERT INTO species (taxid, parent, spname, common, rank, track) VALUES (?, ?, ?, ?, ?, ?);)r'   r$   r%   rM   r&   r^  r   r   r   r   commitr  r   r(   r)   flushr+  )r   rl  r   
create_cmdr@   r  rB  rY   rZ   	taxid_old	taxid_newparentidr  r   r   s                  r   ri  ri  %  s@   	GGG	.&!!!w}}V$$Q'H x00 
		 	 BJ $$  


3	GGGJJ9:EFFFIIKKKT)__-- Z Z4T6Q;;2A53SZPPPPJ

4((..t44v


G%QWYYYY	GGGIIKKKT,//00 g g4T6Q;;2A53SZPPPPJ#zz$//55d;;	9


MPY[dOeffff	GGGIIKKKT*--.. d d4T6Q;;114#CJOOOOJ9=D9I9I9O9OPT9U9U6xw


psx  {C  EK  MS  UY  [b  sc  	d  	d  	d  	d	GGGIIKKKKKr/   __main__	hominidaehomoT)r   r   r3   )+
__future__r   r   r(   r$   cPickler   r`  collectionsr   r   hashlibr   r   rP   re  r   	six.movesr   rv   __all__r   r%   r9   environr   r"   r
   objectr	   rI  rS  r1   ri  r#  ncbir   ar'   r   r   br   rb   r/   r   <module>r     s  X ' & & & & & % % % % % % 



 				   MMMMM - , , , , , , ,          



        -
.
bjnnVS99=-XX !/    *hB hB hB hB hBv hB hB hB|> > >@   8+ 8+ 8+ 8+t3 3 3j z8::D  --A	E!HHH	E$


"
"###	E$

A

  DVZ [[A	E$
#
#A
&
&'''	E$


"
"##### s    	''