
    瞤d                         d Z ddlmZ ddlZddlZddlmZ ddlmZ ddl	Z	ddl
mZ d Z ej        d	          Z eeeeeg          Zd
gZ G d de          Z G d de          Z G d d
e          ZdS )a  
'phylomeDB3' provides an access API to the data stored in the phylogenetic
database PhylomeDB *[1][2].

Methods to perform queries are implemented within the PhylomeDB3Connector class.

 *[1] PhylomeDB: a database for genome-wide collections of gene phylogenies.
      Jaime Huerta-Cepas, Anibal Bueno, Joaquin Dopazo and Toni Gabaldon.
      Nucleic acids research (database issue). 2008.

 *[2] PhylomeDB v3.0: an expanding repository of genome-wide collections of
      trees, alignments and phylogeny-based orthology and paralogy predictions.
      Jaime Huerta-Cepas, Salvador Capella-Gutierrez, Leszek P. Pryszcz, Ivan
      Denisov, Diego Kormes, Marina Marcet-Houben and Toni Gabaldon T.
      Nucleic acids research (database issue). 2010.

      PhylomeDB is a database of complete phylomes derived for different genomes
      within a specific taxonomic range. All phylomes in the database are built
      using a high-quality phylogenetic pipeline that includes evolutionary
      model testing and alignment trimming phases. For each genome, PhylomeDB
      provides the alignments, phylogentic trees and tree-based orthology
      predictions for every single encoded protein.
    )absolute_importN)strip   )	PhyloTree)mapc                 8    |                      d          d         S )N_   )split)names    9lib/python3.11/site-packages/ete3/phylomedb/phylomeDB3.pyextract_species_namer   H   s    	C	    ^[Pp][Hh][Yy]\w{7}(_\w{2,7})?$PhylomeDB3Connectorc                   2    e Zd Zd Zd
dZddZddZd	 ZdS )Phylomec                 :   d| j         d| j        dd| j        | j                 z  z   d| j        z  z   dt          | j                  z  z   dt          | j                  z  z   dt          | j                  z  z   d	d
	                    | j
                  z  z   }|S )NzPhylome z (z)
z seed species: %s
z seed proteome: %s
z Species: %d
z Seed sequences: %d
z Trees: %d
z Proteome list: %s
,)phyidr   tax2name
seed_taxidseed_proteomelenspeciesseed_idstreesjoin	proteomes)selfinfos     r   __str__zPhylome.__str__R   s     !%DIII6do >?@!345 	#dl+++, 	 T]!3!33	4
 	DJ'( 	 8 889D Kr   Tc                 .    | _         | _        |                    t          j        j                   _        |                    t          j        j                   _        dd|z  z   } j        	                    |            j        
                                }| _        |d          _        |d          _        |d          _        |d          _        dd|z  z   } j        	                    |            j                                        }d |D              _        d	d
d                    t'          t(           j                            z  z   } j        	                    |            j                                        }i  _        i  _        i  _        |D ]$\  }}	}
|
 j        |<   |	 j        |<   | j        |	<   % fd|D              _        d j         j                  j        fz   _         j         r"d}|dz  }| j        d j        dz  }|dz  }nd}|d j        z  z  }|d j        z  z  } j        	                    |           d  j                                        D              _                                          d S )NzISELECT seed_taxid, seed_version, name, description, comments FROM phylomez WHERE phylome_id = %sr   seed_versionr   descriptionz*SELECT taxid, version FROM phylome_contentc                 8    g | ]}t          |d                    S r   )int.0es     r   
<listcomp>z$Phylome.__init__.<locals>.<listcomp>q   s"    222!C!II222r   %SELECT taxid, code, name FROM speciesz WHERE taxid IN (%s);r   c                 b    g | ]+}j         |d                   t          |d                   z   ,S )r   r
   )tax2codestr)r*   r+   r    s     r   r,   z$Phylome.__init__.<locals>.<listcomp>   s3    HHHdmAaD)#ad))3HHHr   z%s%d@SELECT DISTINCT CONCAT("Phy", i.longest, "_", s.code) AS protid >FROM protein AS p, isoform AS i, species AS s WHERE p.taxid =  AND p.version =  AND p.protid = i.isoform0 AND p.version = i.version AND p.taxid = s.taxid@SELECT DISTINCT CONCAT("Phy", protid, "_", code) AS protid FROM 4protein AS p, species AS s WHERE p.taxid = %s AND p."version = %s AND p.taxid = s.taxidc                     g | ]
}|d          S r'    r)   s     r   r,   z$Phylome.__init__.<locals>.<listcomp>   s    999aQqT999r   )filter_isoforms_conncursorMySQLdbcursors
DictCursor_dsqlCursor_lsqlexecutefetchoner   r   prot_vsr   r%   fetchallr   r   r   r0   r   r/   code2taxr   r   r   
load_trees)r    r   	connectorr;   cmdphyinfo
phycontentsp_infotaxidcoder   s   `          r   __init__zPhylome.__init__\   s   *DDJ!!'/"<==DJ!!'/"899DJ V E*+CJsj!!##GDJl+DO>*DLDI}-D
6 E*+CJs$$&&J22z222DL 2#c4<*@*@!A!AABCJsj!!##GDMDMDM$ " "tT!dmE!dmE!dmDHHHHZHHHDN$-"@$,!OOD  COc	MMc	RVR^R^R^__c	??ccOc	CtWWc	1T\BBcJs994:#6#6#8#8999DMOOr   Nbest_lkc                    d }|s#t          t          || j                            }nt          t          ||                    }d}dd|z  z   d| j        dd                    |          dz   d	z   }i | _        | j                            |          rH| j        | j	                 }| j        
                                D ]\  }	}
}}d
|	d|}||
|g| j        |<   dS dS )z
    Returns all newick tree for the given set of seqnames and
    model. If no seqnames are provided, all available trees in the
    phylome is returned.
    c                     d }t          j        d|           }|r# ||                                d                   S d S )Nc                     d| z  S )N"%s"r:   )xs    r   <lambda>z8Phylome.load_trees.<locals>.clean_name.<locals>.<lambda>   s
    	 r   zPhy(\w{7})_[\w\d]+r   )researchgroups)r   quotems      r   
clean_namez&Phylome.load_trees.<locals>.clean_name   sJ    !!e
)($
/
/a	
 $uQXXZZ]###$ $r   treez<SELECT  temp.protid, temp.method, temp.lk, temp.newick from z? (SELECT T.protid , T.method, T.newick, T.lk FROM %s AS T WHEREz T.phylome_id = z AND T.protid IN (r   )z, ORDER BY lk DESC) AS temp GROUP BY protid; Phyr	   N)listr   r   r   r   r   rC   rD   r/   r   rG   )r    seqnamesmodelanotate_treesr^   seqids
tree_tablerK   	seed_codeprotidmethodlknwseqids                 r   rI   zPhylome.load_trees   s,   $ $ $  0ST]3344ffSX..//fJ
HIJWX X48JJJ@P@P@P@PQR 	77C
 DJz# --0i$(J$7$7$9$9 - -
 &&"bb#VVYY/,
5	- -- -r   cleanc                     d S Nr:   )r    rm   atypes      r   get_algszPhylome.get_algs       Dr   c                     d S rp   r:   r    s    r   get_seed_specieszPhylome.get_seed_species   rs   r   T)NrR   T)rn   )__name__
__module____qualname__r"   rQ   rI   rr   rv   r:   r   r   r   r   Q   so          3 3 3 3j- - - ->	 	 	 		 	 	 	 	r   r   c                   8    e Zd Z	 	 ddZd ZddZdd	ZddZdS )	PhylomeDB84.88.66.245phylomedb_3public  c           	      b   	 t          j        ||||t          |                    | _        n#  t	          d          xY w| j                            t           j        j                  | _        || _	        || _
        || _        || _        || _        d| _        d| _        d| _        d| _        dS zf Connect to a phylomeDB database and return an object to perform queries
        to the database.
    )hostuserpasswddbportz'ERROR: Check your connection parameterstree_publicalignment_publicphylome_publicphycontent_publicNr>   connectr(   _SQLconnection	NameErrorr=   r?   r@   _SQLr   r   r   r   r   _trees_algs	_phylomes_phy_contentr    r   r   r   r   r   s         r   rQ   zPhylomeDB.__init__   s    A#O4RD		+ + +dA?@@@#**7?+EFFDI DGDIDIDIDK  DK#DJ%DN+D   +. ?c                     d S rp   r:   r    r   s     r   get_phylomezPhylomeDB.get_phylome   rs   r   Nc                     |g }d S rp   r:   )r    protidss     r   get_proteomeszPhylomeDB.get_proteomes   s    gDr   c                     |g }dS )zget aa sequence of seqidsNr:   )r    rf   s     r   get_seqszPhylomeDB.get_seqs   s    ~fDr   Tc                     dS )z%Returns all available trees for seqidNr:   )r    rm   include_collaterals      r   search_treeszPhylomeDB.search_trees   s    Dr   r}   r~   r   r   r   rp   rw   )rx   ry   rz   rQ   r   r   r   r   r:   r   r   r|   r|      sy        GO", , , ,6	 	 		 	 	 	
	 	 	 		 	 	 	 	 	r   r|   c                   N   e Zd ZdZd Z	 	 d9dZd Zd	 Zd
 Zd Z	d Z
d Zd Zd Zd Zd Z	 	 d:dZd Zd Zd Zd Zd Zd Zd Zd;dZd Zd<dZd  Zd! Zd=d"Zd# Zd$ Zd>d%Z d& Z!d' Z"d( Z#d) Z$d* Z%d?d+Z&d, Z'd- Z(d>d.Z)d/ Z*d0 Z+d1 Z,d>d2Z-d3 Z.d4 Z/d5 Z0d>d6Z1	 	 d@d8Z2dS )Ar   at   Returns a connector to a phylomeDB3 database.

  ARGUMENTS:
  ==========
    db: database name in the host server.
    host: hostname in which phylomeDB is hosted.
    user: username to the database.
    port: port used to connect database.
    passwd: password to connect database.

  RETURNS:
  ========
    An object whose methods can be used to query the database.
  c                 .    t          || j                  S )N)rJ   )r   r   r   s     r   _get_phylomez PhylomeDB3Connector._get_phylome   s    5d&9::::r   r}   r~   r   r   c           	      b   	 t          j        ||||t          |                    | _        n#  t	          d          xY w| j                            t           j        j                  | _        || _	        || _
        || _        || _        || _        d| _        d| _        d| _        d| _        dS r   r   r   s         r   rQ   zPhylomeDB3Connector.__init__   s    A#O4RD		+ + +dA?@@@#**7?+EFFDI DGDIDIDIDK  DK#DJ%DN+Dr   c           
         	 | j                             t          j        j                  | _        n#  	 t          j        | j        | j        | j	        | j
        t          | j                            | _         | j                             t          j        j                  | _        n#  t          d          xY wY nxY w| j                            |          S )z Check whether a given connection is still open or not. If the connection
        has been closed by the server, try to recover it. Then, execute the
        input MySQL query
    )r   r   r   r   r   z.ERROR: Impossible to reconnect to the database)r   r=   r>   r?   r@   r   r   r   r   r   r   r(   r   r   rD   )r    commands     r   __execute__zPhylomeDB3Connector.__execute__  s    	K%,,W_-GHHdiiKK%oTYdgT[TYI I I
 '..w/IJJ		JHIII	 9W%%%s!   .1 B=AB'8/B='B88B=c                     d t          t          |                    d                    D             D ]&}|                     |          st	          d          'dS )z' Executes a multi-line MySQL query
    c                     g | ]}||S r:   r:   )r*   ls     r   r,   z9PhylomeDB3Connector.__execute_block__.<locals>.<listcomp>6  s    BBBB!BBBr   ;z5__execute_block__ An error occurred during a MySQL opN)r   r   r   r   r   )r    commandsquerys      r   __execute_block__z%PhylomeDB3Connector.__execute_block__1  sr    
 CBSs(;(;<<BBB Q Qe$$ QOPPPQQ Qr   c                     t          |          t          k    rd}t          d|z            |D ]}||vrt          d          i }|D ]}||||         <   |||                  |= |S )z Takes as an input the result of a MySQL query and returns a dictionary
        using as a key one of the values from the MySQL results
    z=__format_MySQL_to_dict__ Check the input structure datatype. z%s It should be a 'tuple'z)__format_MySQL_to_dict__ Define the index)typetupler   )r    indexdictmsgentryformatted_dicts         r   __fomat_MySQL_to_dict__z+PhylomeDB3Connector.__fomat_MySQL_to_dict__:  s     DzzUKc2s;<<< E Ee^^CDDD  N . .%*nU5\"
u
&u
-
-r   c                 >   |st          dt          |          z            d t          |          t          k    r|gn|}t          |          t          v r'd                    fd|D                       }|r|ndS t          dt          |          z            )zN Returns a string with the input id/s ready to be used in any MySQL query
    z*__parser_ids__: Check your input data '%s'c                 N    t                               |           r
| dd         nd S )N   
   )
ID_PATTERNmatch)ids    r   rX   z4PhylomeDB3Connector.__parser_ids__.<locals>.<lambda>U  s%    J$4$4R$8$8B1R4d r   z, c                 B    g | ]} |          d  |          z  S )rV   r:   )r*   nparsers     r   r,   z6PhylomeDB3Connector.__parser_ids__.<locals>.<listcomp>[  s3    HHH!ffQiiH6FF1II-HHHr   N)r   r0   r   ITERABLE_TYPESr   )r    idsparsedr   s      @r   __parser_ids__z"PhylomeDB3Connector.__parser_ids__O  s      QCs3xxOPPPBBF99##3%%C CyyN""zzHHHHsHHHIIf'VV4'
ASXXM
N
NNr   c                 r   t          g d          }t          |          |z
  t                      k    rt          d          |D ]}|dvr||         s||         dvr dS d|v rJ|d         rBt          |d                                                   rt          |d                   dk    rdS d|v r)t          |d                                                   sdS d	|v r t	          |d	                   t
          k    rdS d
|v r;|                     |d
                   rt	          |d
                   t          k    rdS d|v r|                     |d                   sdS d|v rS|d         rKt	          |d                   t          k    s+t          |d                                                   dk    rdS dS )z* Check the different input parameters
    )
str_number	single_idlist_idrP   stringbooleannumberz'check_input_parameter: Invalid datatyper   r   )r    FFr   0r   r   r   r   r   r
   T)	setr   r0   isdigitr   boolr   r   r   )r    kargs
valid_keyskeys       r   __check_input_parameter__z-PhylomeDB3Connector.__check_input_parameter__`  s   
      J 5zzJ#%%''?@@@   	(	(	(s	(:^++ 5	x 5?##++-- 	U8_1E1E1L1LuS|)<%=%=%E%E%G%GUEd5#344<<U e  {!344 U;  C''uE$"5"5eI6F"G"GU 5	x h  C''3uX/D/D/F/F+G+G!+K+K 4r   c                    |                      |          }d}|dz  }|d|z  z  }|dz  }i }|                     |          r| j                                        D ]k}|                    |d         i            ||d                                      |d         t                                                    |d                    l|D ]1}||         D ]&}t          ||         |                   ||         |<   '2|S )z Returns all the external IDs registered in the 'external_id' table that
        are associated to the input phylomeDB IDs
    @SELECT DISTINCT CONCAT("Phy", p.protid, "_", s.code) as protid, zCexternal_db AS db, external_id AS id FROM protein AS p, species AS z:s, external_id AS ex WHERE p.protid IN (%s) AND p.taxid = z s.taxid AND p.protid = ex.protidri   r   r   r   r   r   rG   
setdefaultr   addrb   r    r   rK   external_idsrowri   r   s          r   get_external_idsz$PhylomeDB3Connector.get_external_ids  s,   
 

c
"
"CMCPPCG3OOC--CL P##%% P P#Hr222S]#..s4y#%%@@DDSYOOOO D Df% D D#$(f)=c)B$C$CVS!!D r   c                    |                      |          }d}|dz  }|d|z  z  }|dz  }i }|                     |          r| j                                        D ]k}|                    |d         i            ||d                                      |d         t                                                    |d                    l|D ]1}||         D ]&}t          ||         |                   ||         |<   '2|S )zJ Returns all available GO Terms associated to the input phylomeDB IDs
    r   zCexternal_db AS db, CONCAT("GO:", go_term) AS go FROM protein AS p, z8species AS s, go WHERE p.protid IN (%s) AND p.taxid = s.ztaxid AND p.protid = go.protidri   r   gor   r   s          r   
get_go_idszPhylomeDB3Connector.get_go_ids  s,    

c
"
"CMCPPCEMMC++CL P##%% P P#Hr222S]#..s4y#%%@@DDSYOOOO D Df% D D#$(f)=c)B$C$CVS!!D r   c                    |                      |          }d}|dz  }|dz  }|d|z  z  }i }|                     |          r| j                                        D ]k}|                    |d         i                               dt                                 ||d                  d                             |d                    l|D ]1}||         D ]&}t          ||         |                   ||         |<   '2|S )zY Returns all old phylomeDB IDs associated to each of the input phylomeDB
        IDs
    r   zCCONCAT(old_code, LPAD(old_protid, 7, "0")) AS old_code FROM proteinz: AS p, species AS s, old_protein AS old WHERE p.protid IN z4(%s) AND p.taxid = s.taxid AND p.protid = old.protidri   old_phylomedbold_coder   )r    r   rK   old_idsr   ri   r   s          r   get_old_phylomedb_idsz)PhylomeDB3Connector.get_old_phylomedb_ids  s$   
 

c
"
"CMCPPCGGCASIICG E##%% E E#3x="--88#%%PPPH/33C
ODDDD : : : :##GFOC$899: Nr   c                    |                      |          }d}|dz  }|d|z  z  }i }|                     |          r| j                                        D ]}|d         ri|                    |d         i                               dt                                 ||d                  d                             |d                    |d         ri|                    |d         i                               dt                                 ||d                  d                             |d                    |D ]1}||         D ]&}t          ||         |                   ||         |<   '2|S )z_ Returns all possible protein and gene names associated to the input
        phylomeDB IDs
    r   zCprot_name, gene_name FROM protein AS p, species AS s WHERE p.protidz IN (%s) AND p.taxid = s.taxid	prot_nameri   protein_name	gene_namer   )r    r   rK   r   r   ri   r   s          r   get_prot_gene_namesz'PhylomeDB3Connector.get_prot_gene_names  s   
 

c
"
"CMCPPC+s33CD A##%% A A#{ 	D
//#h-
,
,
7
7
N
N
N
s8}
n
-
1
1#k2B
C
C
C{ 	A
//#h-
,
,
7
7SUU
K
K
K
s8}
k
*
.
.s;/?
@
@
@ 4 4f 4 4# fc!233VS4 Kr   c                     d}t          j        ||          sdS d}|dz  }|d|dd         d|dd         dz  }|                     |          r| j                                        d	         S dS )
zE Return the conversion between an old phylomeDB ID and a new one
    ^\w{3}\d{1,}$Nz?SELECT CONCAT("Phy", p.protid, "_", s.code) as protid FROM old_zCprotein AS p, species AS s WHERE (s.taxid = p.taxid AND p.old_code z= "r   z" AND p.old_protid = r`   ri   )rY   r   r   r   rE   )r    old_idQUERY_OLD_REGEXP_FILTERrK   s       r   get_new_phylomedb_idz(PhylomeDB3Connector.get_new_phylomedb_id  s    
 .8+V44 T MCPPCCfRaRjjj&***EEC ,Y!!(++4r   NFc                 d   i }||                      ||d          }n9|r|s|                      ||d          }n|r|r|                      |||          }|s|S t          |                                          d         }|r^|                    di            ||d         d<   ||         d         |d         d<   ||         d         |d         d<   |sdnd	|d         d
<   ||         d                                         }	t          d |	D                       }
|                     |
          }t          |	          t          t          |	                    k    rdnd	}d}|dz  }|dz  }|dz  }|d| j        d| j	        dz  }|d| j
        z  z  }|dz  }|d|z  z  }|d|z  z  }|dz  }|                     |          r|                    di            | j                                        D ]}|d                             |d         i            |d         |d         |d                  d<   |d         |d         |d                  d<   |d         |d         |d                  d<   |d         |d         |d                  d<   |d         |d         |d                  d<   |d          |d         |d                  d!<   |d"         |d         |d                  d#<   |r]d$}|d%z  }|d&|z  z  }|                     |          r9| j                                        D ]}|d         |d         |d                  d<    t          j        |                     |
                    D ]8\  }}|d         |                             d'i            ||d         |         d'<   9t          j        |                     |
                    D ]N\  }}|d         |                             d'i            |d         |         d'                             |           Ot          j        |                     |
                    D ]N\  }}|d         |                             d'i            |d         |         d'                             |           Ot          j        |                     |
                    D ]N\  }}|d         |                             d'i            |d         |         d'                             |           O|                    d(|	           d)}|d*z  }|| j        d| j	        d+z  }|d,|z  z  }|d-|z  z  }|d.z  }|                     |          r|                    d/i            | j                                        D ]}|d         }|d         |d                  d         d0k    r|r	 |s|d         d0k    s||	vr>|d/                             |i            |d1         |d/         |         d2<   |d3         |d/         |         d4<   |d         |d/         |         d5<   |S )6z Return all the available information for a given set of homologous
        sequences extracted from a tree from a given phylome.
    NT)	best_treerj   r   r_   rj   rk   Fbestc                 n    g | ]2}d                      |                    d           dd                   3S )r	   Nr   )r   r   )r*   r   s     r   r,   z@PhylomeDB3Connector.get_info_homologous_seqs.<locals>.<listcomp>'  s7    @@@sxx

3+,,@@@r   zCSELECT CONCAT("Phy", p.protid, "_", s.code) AS protid, s.code, CONCzCAT(s.code, ".", p.version) AS proteome, p.taxid, p.version, s.name,zC MAX(copy) AS copy, count(DISTINCT method) AS trees, count(DISTINCTzC sf.protid, sf.phylome_id) AS collat FROM (protein AS p, species ASz s,  AS ph, z AS pc) LEFT z0JOIN %s AS t ON (p.protid = t.protid) LEFT JOIN zBseed_friend AS sf ON (p.protid = sf.friend_id) WHERE (p.protid IN z)(%s) AND p.taxid = s.taxid AND p.taxid = z4pc.taxid AND pc.phylome_id = %s AND ph.phylome_id = z;pc.phylome_id AND pc.version = p.version) GROUP BY p.protidseqri   copyr   rO   proteomecollat
collateralr   species_namerP   species_codez@SELECT CONCAT("Phy", p.protid, "_", s.code) AS protid, seq FROM zAprotein AS p, species AS s, unique_protein AS u WHERE(p.protid INz4 (%s) AND p.taxid = s.taxid AND p.protid = u.protid)external
leaf_namesz=SELECT CONCAT("Phy", p.protid, "_", s.code) AS protid, copy, z6prot_name, gene_name FROM protein AS p, species AS s, z AS pc WHERE (p.z6protid IN (%s) AND p.taxid = s.taxid AND p.taxid = pc.z4taxid AND pc.phylome_id = %s AND ph.phylome_id = pc.z&phylome_id AND pc.version = p.version)leavesr
   r   gener   proteincopy_version)get_treerb   keysr   get_leaf_namesr   r   r   r   r   r   r   r   rG   six	iteritemsr   r   updater   r   )r    ri   
phylome_idr_   tree_methodsequencedatatree_dbrj   r  r   r   copy_var_supportrK   r   r  rP   s                    r   get_info_homologous_seqsz,PhylomeDB3Connector.get_info_homologous_seqs  s    D|fjdCCgg	 Hk HfjdCCgg	 H+ Hfj;GGg  'kGLLNN##A&f  @
oofb!!!%d6l8"6?40d6l4$V_V4d6l6)4?TT%d6l6
 V_V$3355F
@@@@@
A
AC!!#&&G  #6{{c#f++.>.>>>ttE QCPPCPPCPPCCdnnnT=N=N=NOOC=MMCOOC6'BBCAZPPCHHC A
ooeR   ##%% 	A 	A#Us8}b111-0[UCM"6*.1'lUCM"7+.1'lUCM"7+14ZUCM":.36x=UCM"<058[UCM">258[UCM">22  9Oc	PPc	CwOOc			#		 99%%'' 	9 	9C.1%j$u+c(m
$U
+
+  M$*?*?*D*DEE 1 1
5k&$$Z444(0d5k&*%%M$//#*>*>?? 7 7
5k&$$Z444
5k&*%,,X6666M$*D*DS*I*IJJ 7 7
5k&$$Z444
5k&*%,,X6666M$*B*B3*G*GHH 7 7
5k&$$Z444
5k&*%,,X6666 	OOL&))) KCCCCT^^^T=N=N=NOOCCwOOCAZPPC33C  ;
ooh#####%% ; ;#8};s8}%f-116F1
 	CK!OO46>>
X!!$+++'*;'7XtV$*-k*:XtY'/26{Xt^,,Kr   c                    |                      |          st          d          |                     |          }d}|d|d|dz  }|                     |          si S |                     d | j                                        D                       }d}|d	z  }|d
|z  z  }i }|                     |          r:| j                                        D ] }t          |d                   ||d         <   !|S )zD Returns all the isoforms registered for the input phylomeDB ID
    r   z'get_all_isoforms: Check your input dataz@SELECT CONCAT("Phy", isoform) AS i, CONCAT("Phy", longest) AS l zFROM isoform WHERE(isoform = z OR longest = r`   c                 (    g | ]}|D ]
}||         S r:   r:   )r*   rks      r   r,   z8PhylomeDB3Connector.get_all_isoforms.<locals>.<listcomp>  s)    MMM1MMaqtMMMMr   zASELECT CONCAT("Phy", p.protid, "_", code) AS protid, LENGTH(seq) zCAS ln FROM protein AS p, species AS s, unique_protein AS u WHERE p.z<protid IN (%s) AND p.protid = u.protid AND p.taxid = s.taxidlnri   )r   r   r   r   r   rG   r(   )r    r   ri   rK   r   isoformsr   s          r   get_all_isoformsz$PhylomeDB3Connector.get_all_isoforms  s)   
 ))b)99 A?@@@  $$F NCCPPC C   i


MMTY-?-?-A-AMMM
N
NC OCPPCICPPCH 1##%% 1 1#"%c$i..XOr   c                    |                      |          st          d          |                     |          }d}|dz  }|dz  }|d|z  z  }i }|                     |          r| j                                        D ]}|                    |d         i                               t          |d                   g            ||d                  t          |d                                                |d	                    |S )
z: Returns the longest isoform for a given phylomeDB ID
    r  z2get_longest_isoform fuction: Check your input datazBSELECT DISTINCT CONCAT("Phy", p.protid, "_", s.code) as protid, s.z>code, p.version FROM protein AS p, species AS s, isoform AS i zBWHERE (i.longest = p.protid AND i.version = p.version AND p.taxid z= s.taxid AND i.isoform = %s)rP   versionri   )	r   r   r   r   r   rG   r   r(   append)r    r   ri   rK   r  r   s         r   get_longest_isoformz'PhylomeDB3Connector.get_longest_isoform  s   
 ))b)99 LJKKK  $$F PCKKCOOC*f55CH  I##%% I I#CK,,77C	N8K8KRPPPVc#i.11299#h-HHHHOr   c                    |                      |          st          d          d}|dz  }|d|z  z  }i }|                     |          r| j                                        D ]}t          j        |                     |d                             D ]_\  }}t          j        |          D ]E\  }}|                    |i                               |g            ||         |xx         |z  cc<   F`d}|d|z  z  }|d|z  z  }|                     |          r| j                                        D ]}t          j        |                     |d                             D ]_\  }}t          j        |          D ]E\  }}|                    |i                               |g            ||         |xx         |z  cc<   F`|D ]>}||         D ]3}t          t          ||         |                             ||         |<   4?|S )	z> Returns the protein id associated to a given external id
    )rP   z)get_id_by_external: Check your input datazBSELECT DISTINCT CONCAT("Phy", p.protid, "_", code) AS protid FROM zBexternal_id AS e, protein AS p, species AS s WHERE (e.external_id z5= "%s" AND e.protid = p.protid AND p.taxid = s.taxid)ri   z6protein AS p, species AS s WHERE ((prot_name = "%s" ORz) gene_name = "%s") AND p.taxid = s.taxid))r   r   r   r   rG   r
  r  r  r   rb   r   )	r    r  rK   r   r   spr   r   proteinss	            r   get_id_by_externalz&PhylomeDB3Connector.get_id_by_external  sk    ))):: CABBB PCOOCBhOOC
C $##%% $ $#M$*B*B3x=*Q*QRR 	$ 	$LB!mH55 $ $lb(NN2r""--b"555GBKKK8#KKKK$	$ PCCxPPC6(CCC  $##%% $ $#M$*B*B3x=*Q*QRR 	$ 	$LB!mH55 $ $lb(NN2r""--b"555GBKKK8#KKKK$	$
  9 9"g 9 9( SWX%6!7!788B9 Jr   c                    |                      |          st          d          |                                }d}d}d}i }t          j        ||          r|                     |          }|i k    rAt          j        ||          r,|                     |          }|r|                     |          }|i k    r*t          j        ||          r|                     |          }|S )z Returns a list of the longest isoforms for each proteome where the ID is
        already registered. The ID can be a current phylomeDB ID version, former
        phylomeDB ID or an external ID.
    r   z search_id: Check your input dataz^[\w\d\-_,;:.|#@\/\\()'<>!]+$r   r   )r   r   r   rY   r   r  r   r#  )r    r   r   QUERY_GEN_REGEXP_FILTERr   QUERY_INT_REGEXP_FILTERphylomeDB_matches	currentIDs           r   	search_idzPhylomeDB3Connector.search_id  s    ))2)66 :8999HHJJE ?->	x'// :22599 B28,CU#K#K++E22i	 @ 44Y?? B28,CU#K#K11%88r   c                    |                      |          st          d          i }|                     |          }|r@|                    |t	          |                                          d                             |                     |          }|r@|                    |t	          |                                          d                             |                     |          }|r@|                    |t	          |                                          d                             |                     |          }|r@|                    |t	          |                                          d                             |S )zE Returns all the registered translations of a given phylomeDB ID
    r  z*get_id_translations: Check your input datar   )	r   r   r   r  rb   r  r   r   r   )r    r   
conversionr!   s       r   get_id_translationsz'PhylomeDB3Connector.get_id_translations	  sZ   
 ))b)99 DBCCCJ  $$D 4T$))++..q12333??2D 4T$))++..q12333%%b))D 4T$))++..q12333##B''D 4T$))++..q12333r   c                     d||fz  }|                      |          r#d | j                                        D             S g S )NzNSELECT protid, prot_name, gene_name FROM protein WHERE taxid=%d AND version=%dc                 <    g | ]}|d          |d         |d         fS )ri   r   r   r:   r*   r  s     r   r,   zEPhylomeDB3Connector.get_translations_for_proteome.<locals>.<listcomp>*  s+    ZZZq{AkNAkN;ZZZr   )r   r   rG   )r    rO   r  rK   s       r   get_translations_for_proteomez1PhylomeDB3Connector.get_translations_for_proteome%  sS    [	C  ZZTYEWEWEYEYZZZZir   c                 .   |                      |          st          d          |                     |          }d}|dz  }|d| j        d|dz  }|dz  }|d	z  }|                     |          r#d
 | j                                        D             S g S )zq Return the trees where the protid is presented as part of the homolog
        sequences to the seed protein
    r   z+get_collateral_seeds: Check your input datazASELECT DISTINCT CONCAT("Phy", p.protid, "_", code) AS protid, sf.zCphylome_id AS phylome FROM seed_friend AS sf, species AS s, proteinz AS p, z AS ph WHERE (friend_id IN (r`   zC AND sf.protid = p.protid AND p.taxid = s.taxid AND ph.phylome_id =z sf.phylome_id)c                 .    g | ]}|d          |d         fS )ri   phylomer:   r0  s     r   r,   z<PhylomeDB3Connector.get_collateral_seeds.<locals>.<listcomp>C  s%    HHHaq{AiL)HHHr   )r   r   r   r   r   r   rG   )r    ri   rK   s      r   get_collateral_seedsz(PhylomeDB3Connector.get_collateral_seeds/  s     ))F);; ECDDD  ((F OCPPCCPPCPPCC  IHH493E3E3G3GHHHHIr   c                 B   |                      ||          st          d          |                     |          }|r|rd}|dz  }t          |          d| j        z  }|d|d|dz  }|r|d	d
         dz   }n|r|d	d
         d|z  z   }i }|                     |          rz| j                                        D ]`}	|	d         |                    |	d         i           d<   t          |	d         t                    |                    |	d         i           d<   a|S )a   Depending in the input parameters select either
        .- a tree with the best evolutionary model in terms of LK (best_tree)
        .- a tree reconstructed using a specific model (method)
        .- all available model/trees for the tuple (phylomeDB ID, phylome ID)
    r   r   get_tree: Check your input dataz>get_tree: Impossible to ask for the best model and ask at the z+same time for a specific evolutionary model0SELECT newick AS tree, method, lk FROM %s WHERE 
(protid =  AND phylome_id = r`   Nz- AND method != "NJ") ORDER BY lk DESC LIMIT 1z AND method = "%s")rk   rj   r_   sp_naming_function)
r   r   r   r   r   r   rG   r   r   r   )
r    r   r  rj   r   ri   r   rK   r   r   s
             r   r  zPhylomeDB3Connector.get_treeF  se    ))bz)RR 97888  $$F ) Mc	::ccNN >MCC&&&***EEC 8HFFcc	 8H,77cE v##%% v v#47IX++D16?F`t6u6u6uX++F33Lr   c                 P   |                      ||          st          d          |                     |          }d| j        z  }|d|d|dz  }|dz  }i }|                     |          r8| j                                        }t          |d         t          	          |d<   |S )
zt return a tree for input id in the given phylome for the best fitting
        evolutionary model in terms of LK
    r8  r9  r:  r;  r<  z  AND methodz" != "NJ") ORDER BY lk DESC LIMIT 1r_   r>  )	r   r   r   r   r   r   rE   r   r   )r    r   r  ri   rK   r_   s         r   get_best_treez!PhylomeDB3Connector.get_best_treek  s     ))bz)RR 97888  $$F >MCCVVVZZZPPC//CD VY!!dtF|@TUUUd6lKr   Tc                 0   |                      ||          st          d          |                     |          }|s|si S |r|rdn|rdnd}d|d| j        d}|d	|d
|dz  }|                     |          r| j                                        S i S )zu Return the either the clean, the raw or both alignments for the input
        phylomeDB ID in the input phylome
    r8  zget_algs: Check your input datazraw_alg, clean_algraw_alg	clean_algzSELECT z, seqnumber FROM z WHERE z(phylome_id =  AND protid = r`   )r   r   r   r   r   r   rE   )r    r   r  rC  rD  ri   rK   s          r   rr   zPhylomeDB3Connector.get_algs  s     ))bz)RR 97888  $$F  9 i #* "i "

	>"YY! C 58CCDCC***fffEEC "Y!!!Ir   c                 
   |                      ||          st          d          d| j        z  }||d|                     |          dz  }|                     |          r| j                                        d         S dS )N Return the raw alignment for the input phylomeDB ID in the given phylome
    r8  "get_raw_alg: Check your input dataz+SELECT raw_alg FROM %s WHERE (phylome_id = rE  r`   rC  r   r   r   r   r   r   r   rE   r    r   r  rK   s       r   get_raw_algzPhylomeDB3Connector.get_raw_alg  s    
 ))bJ)OO <:;;;8DJGCJJJ0C0CB0G0G0G0GHHC -Y!!),,2r   c                 
   |                      ||          st          d          d| j        z  }||d|                     |          dz  }|                     |          r| j                                        d         S dS )rG  r8  rH  z-SELECT clean_alg FROM %s WHERE (phylome_id = rE  r`   rD  r   rI  rJ  s       r   get_clean_algz!PhylomeDB3Connector.get_clean_alg  s    
 ))bJ)OO <:;;;:djICJJJ0C0CB0G0G0G0GHHC /Y!!+..2r   c                     |                      |||          st          d          |r|                     ||d|          S |                     ||d          S )z Return all the available information for each sequence from tree/s
        asociated to a tuple (protein, phylome) identifiers.
    )r   r   r   z+get_seq_info_in_tree: Check your input dataT)r_   r  )r_   r   r   r  )r    r   r  rj   s       r   get_seq_info_in_treez(PhylomeDB3Connector.get_seq_info_in_tree  s     ))b6 *   ECDDD  H**2z$ +    **2z$*GGGr   c                 ~    |                      ||          st          d          |                     ||d          S )z Return all available information for the homologous sequences to the
        input phylomeDB ID in the input phylome using the best tree to compute
        the set of homologous sequences
    r8  z-get_seq_info_msf: Check your input parametersT)r  rO  )r    r   r  s      r   get_seq_info_msfz$PhylomeDB3Connector.get_seq_info_msf  sL     ))bz)RR GEFFF ((ZD(IIIr   c                    |                      |          st          d|z            |                     |          }d}|dz  }|dz  }|d|z  z  }|                     |          si S | j                                        }d}|d|z  z  }|                     |          si S | j                                        D ]}|d	         d
|d         }|                    dt                                	                    |           |                    di                               |d           |d         |         |d         k     r|d         n|d         |         |d         |<   |d         rO|                    di                               |t                                	                    |d                    |d         rO|                    di                               |t                                	                    |d                    |d         rO|                    di                               |t                                	                    |d                    | 
                    |d                   |d<   |                     |          }|r-|t          |                                          d                  ni |d<   |                     |          }|rF|d                             |t          |                                          d                             |                     |          }|rF|d                             |t          |                                          d                             |                     |          }|rF|d                             |t          |                                          d                             |S )z8 Returns available information about a given protid
    r  zWrong id [%s]zBSELECT DISTINCT CONCAT("Phy", p.protid, "_", s.code) AS protid, s.zBcode AS species_code, s.name AS species_name, p.taxid, u.seq FROM zBprotein AS p, unique_protein AS u, species AS s WHERE (p.protid = z1%s AND p.protid = u.protid AND p.taxid = s.taxid)zBSELECT version, copy, prot_name, gene_name, comments FROM protein zWHERE protid = %sr   .r  r   r   r   r   r  r   r  commentsri   r  r  )r   r   r   r   r   rE   rG   r   r   r   r  r   rb   r  r   r  r   r   )r    r   ri   rK   r  r   r   r!   s           r   get_seqid_infoz"PhylomeDB3Connector.get_seqid_info  s   
 ))b)99 +or)***  $$F PCOOCOOC>&IICC   i9D PC&))CC   iy!!##  ">222C	NNCh
ook355))--h777
oofb!!,,Xq999.26l8.DF/ /s6{{fh/ 6l8 
[	  	2&&11(CEEBB
#c+


	[	  ##..x??
#c+


	Z 
B''228SUUCC
#c*o


 ,,T(^<<D   $$D59AtD--a011rD??2D :
:d4		#4#4Q#78999%%b))D :
:d4		#4#4Q#78999##B''D :
:d4		#4#4Q#78999Kr   c           	      F   |                      ||          st          d          |                     |          }d}|d| j        z  z  }|d|z  z  }|dz  }|                     |          s|si S i }| j                                        D ]}|                    t          |d                   i            |t          |d                                                |d         t                                
                    |d	                    i }|D ]W}|                    |g            ||         D ]6}	||                             d
|	d ||         |	         D             g           7X|r|                     |          D ]\  }
}d| j        z  }||                     |
          d|z  }|                     |          rZ|                    t          |          g                               d|
d | j                                        D             g           |S )z Returns information about which methods have been used to reconstruct
        every tree for a given phylomeDB ID grouped by phylome
    )r   r   Check your input datazCSELECT DISTINCT phylome_id, method, CONCAT("Phy", p.protid, "_", s.z3code) AS protid FROM protein AS p, species AS s, %sz6 AS t WHERE (p.protid IN (%s) AND p.protid = t.protid zAND p.taxid = s.taxid)r  ri   rj   Tc                     g | ]}|S r:   r:   )r*   r]   s     r   r,   zFPhylomeDB3Connector.get_available_trees_by_phylome.<locals>.<listcomp>D  s    -L-L-LAa-L-L-Lr   z%SELECT method FROM %s WHERE protid = r<  Fc                     g | ]
}|d          S r   r:   r0  s     r   r,   zFPhylomeDB3Connector.get_available_trees_by_phylome.<locals>.<listcomp>M  s4     C+ C+ C+ DEX; C+ C+ C+r   )r   r   r   r   r   r   rG   r   r(   r   r   r  r6  )r    r   r   r   rK   tr  r   r5  ri   seeds              r   get_available_trees_by_phylomez2PhylomeDB3Connector.get_available_trees_by_phylome$  s    ))B*)MM /-...!!"%%G QC@DKPPCCwOOC##CC    i
AY!! N Nll3q'',,,AlO((8cee<<@@8MMMME  O Ow###gJ O O&gtV-L-L7F9K-L-L-LMNNNNO
  -44R88 - --$6$+F4+>+>t+D+D+D+DggNNC   	-


3w<<
,
,
3
3UD C+ C+Y''))C+ C+ C+ 5, - - - Lr   c                     |                      |          st          d          d| j        z  }|d|z  z  }|                     |          r-|                     d| j                                                  S i S )zK Retuns the frequency of each evolutionary method in the input phylome
    r   z"count_trees: Check your input dataz.SELECT method, count(*) AS freq FROM %s WHERE z!(phylome_id = %s) GROUP BY methodrj   )r   r   r   r   r   r   rG   r    r  rK   s      r   count_treeszPhylomeDB3Connector.count_treesR  s     ))z)BB <:;;;;t{KC.*==C J))(DI4F4F4H4HIIIIr   c                     |                      |          st          d          d| j        z  }|d|z  z  }|                     |          r,t	          | j                                        d                   S dS )z9 Returns how many alignments are for a given phylome
    r_  z!count_algs: Check your input dataz SELECT count(*) AS freq FROM %s WHERE phylome_id = %sfreqr   )r   r   r   r   r(   r   rE   r`  s      r   
count_algszPhylomeDB3Connector.count_algs`  s    
 ))z)BB ;9:::-<C"j11C 0$)$$&&v.///1r   c           	         |                      |          st          d          d}|d| j        d| j        dz  }|d|z  z  }|dz  }i }|                     |          rt| j                                        D ]Z}|                    |d	         i                               |d
         |d         t          |d         t                    g           [|S )z5 Returns all trees available for a given phylome
    r_  z(get_phylome_trees: Check your input dataz?SELECT CONCAT("Phy", protid, "_", code) AS protid, method, lk, znewick FROM  AS t, r   z.species AS s WHERE (ph.phylome_id = %s AND ph.z6phylome_id = t.phylome_id AND ph.seed_taxid = s.taxid)ri   rj   rk   newickr>  )
r   r   r   r   r   r   rG   r   r   r   )r    r  rK   r   r   s        r   get_phylome_treesz%PhylomeDB3Connector.get_phylome_treeso  s   
 ))z)BB B@AAA MCCT^^^LLC;zJJCCCCE  Z##%% Z Z#X++66s8}t9iHBVWWW
X	Z 	Z 	Z 	ZLr   c                    |                      |          st          d          d}|d| j        d| j        dz  }|d|z  z  }|dz  }i }|                     |          rf| j                                        D ]L}|d	         |                    |d
         i           d	<   |d         |                    |d
         i           d<   M|S )z: Returns all alignments available for a given phylome
    r_  z'get_phylome_tree: Check your input datazBSELECT CONCAT("Phy", protid, "_", code) AS protid, raw_alg, clean_z	alg FROM z AS a, z AS ph, species z3AS s WHERE (ph.phylome_id = %s AND ph.phylome_id = z)a.phylome_id AND ph.seed_taxid = s.taxid)rC  ri   rD  )r   r   r   r   r   r   rG   r   )r    r  rK   algsr   s        r   get_phylome_algsz$PhylomeDB3Connector.get_phylome_algs  s    
 ))z)BB A?@@@ PCCTZZZPPC@JOOC66CD K##%% K K#8;IHr**95:=k:JHr**;77Kr   c                     |                      d          r-|                     d| j                                                  S i S )z< Returns all current registered species in the database
    r-   rP   r   r   r   rG   ru   s    r   get_specieszPhylomeDB3Connector.get_species  sE     ?@@ H))&$)2D2D2F2FGGGIr   c                    |                      ||          st          d          |s|si S d}||rd||rdndndz  }||r|rdndd|d	ndz  }|                     |          r| j                                        S i S )
z5 Returns all information on a given species/code
    r   rX  z2SELECT taxid, code, name FROM species AS s WHERE (ztaxid = r   r`   z ANDz	 code = "z"))r   r   r   r   rE   )r    rO   rP   rK   s       r   get_species_infoz$PhylomeDB3Connector.get_species_info  s    
 ))54)HH /-...  i ?CEIIUU$$7BBC$788rIC$N% 7R 7 7>>BNC  "Y!!!Ir   c                     d}|dz  }|                      |          r-|                     d| j                                                  S i S )z5 Returns all current available genomes/proteomes
    zCSELECT CONCAT(code, ".",version) AS g, g.taxid, source, DATE(date),zC AS date,name FROM genome AS g, species AS s WHERE s.taxid =g.taxidgrn  r    rK   s     r   get_genomeszPhylomeDB3Connector.get_genomes  sU    
 QCPPC E))#ty/A/A/C/CDDDIr   c                 Z   |                      |          st          d          |                    d          dk    rt          d          |                    d          \  }}d}|dz  }|d|z  z  }|d	|z  z  }|                     |          r| j                                        S i S )
zJ Returns all available information about a registered genome/proteome
    r%  z&get_genome_info: Check your input datarT  r
   z5get_genome_info: Expected input 'SpeciesCode.version'zCSELECT CONCAT(code, ".", version) AS genome_id, g.taxid, s.name AS zCspecies, version, DATE(date) AS date, source, comments FROM speciesz: AS s, genome AS g WHERE(s.taxid = g.taxid AND code = "%s"z AND version = %s))r   r   countr   r   r   rE   )r    genomerP   r  rK   s        r   get_genome_infoz#PhylomeDB3Connector.get_genome_info  s    
 ))6):: @>???||CAMNNNLL%%MD'PCPPCG4PPC7++C  "Y!!!Ir   c                    |                      |          st          d          |                      |          st          d          |                      |          st          d          |rd}|dz  }||d|dz  }|dz  }nd	}|d
|z  z  }|d|z  z  }|                     |          sg S d | j                                        D             S )z Returns the phylomeDB IDs for a given genome in the database filtering
        out, or not, the different isoforms for each ID
    r_  z%get_genome_ids: Check your input datar   r1   r2   r3   r4   r5   r6   r7   r8   c                     g | ]
}|d          S )ri   r:   r*   r   s     r   r,   z6PhylomeDB3Connector.get_genome_ids.<locals>.<listcomp>  s    :::cCM:::r   r   r   r   r   rG   )r    rO   r  r;   rK   s        r   get_genome_idsz"PhylomeDB3Connector.get_genome_ids  s    ))u)== ?=>>>))w)?? ?=>>>))O)DD ?=>>>  >Oc	MMc	PPc	??ccOc	CuMMc	1W==cC   i ;:TY%7%7%9%9::::r   c                     |                      |          st          d          d}|d|z  z  }i }|                     |          r&d | j                                        D             ||<   |S )zK Return all the proteomes/genomes registered for the input taxaid code
    r_  rX  z?SELECT CONCAT(code, ".", version) AS genome FROM species AS s, z4genome AS g WHERE s.taxid = %s AND s.taxid = g.taxidc                     g | ]
}|d          S )rx  r:   r}  s     r   r,   z>PhylomeDB3Connector.get_genomes_by_species.<locals>.<listcomp>
  s    FFF#HFFFr   r~  )r    rO   rK   genomess       r   get_genomes_by_speciesz*PhylomeDB3Connector.get_genomes_by_species  s    
 ))u)== /-... MCAUKKCG GFF1C1C1E1EFFFgenNr   c                     d}|dz  }|d| j         z  z  }|dz  }|                     |          r-|                     d| j                                                  S i S )z, Returns all current available phylomes
    zCSELECT phylome_id, seed_taxid, s.name AS seed_species, CONCAT(code,zB ".", seed_version) AS seed_proteome, DATE(ts) AS date, comments, z/ph.name FROM species AS s, %s AS ph WHERE seed_ztaxid = s.taxidr  )r   r   r   r   rG   rt  s     r   get_phylomesz PhylomeDB3Connector.get_phylomes  st    
 QCOOC<OOCC  N)),	8J8J8L8LMMMIr   c                     |                      |          st          d          d}|dz  }|d| j        z  z  }|d|z  z  }|                     |          r| j                                        S i S )z6 Returns available information on a given phylome
    r_  z'get_phylome_info: Check your input datazCSELECT phylome_id AS id, seed_taxid, s.name AS seed_species, CONCATzA(code, ".", seed_version) AS seed_proteome, DATE(ts) AS date, ph.z0name, comments FROM species AS s, %s AS ph WHEREz0(ph.phylome_id = %s AND ph.seed_taxid = s.taxid))r   r   r   r   r   rE   r`  s      r   get_phylome_infoz$PhylomeDB3Connector.get_phylome_info  s    
 ))z)BB A?@@@ QCNNC=PPC=LLC "Y!!!Ir   c                 ~   |                      |          st          d          |                      |          st          d          d| j        z  }|d|z  z  }|                     |          rK| j                                        }|                     |d         |d         |          |d         |d         fS g S )	zr Returns the seed phylomeDB IDs for a given phylome being possible to
        filter out the longest isoforms
    r_  z'get_seed_ids: Check your input data (1)r{  z'get_seed_ids: Check your input data (2)z(SELECT seed_taxid, seed_version FROM %s rc  r   r$   )r   r   r   r   r   rE   r  )r    r  r;   rK   r   s        r   get_phylome_seed_idsz(PhylomeDB3Connector.get_phylome_seed_ids.  s     ))z)BB A?@@@))O)DD A?@@@ 6HC"j11C  AI  c   \!2C4G l+S-@A AIr   c                    |                      |          st          d          d}|d| j        z  z  }|d| j        z  z  }|d|z  z  }|dz  }i }|                     |          r| j                                        D ]2}|                    di                               |d	         |           3d
}|d| j        z  z  }|d|z  z  }|                     |          r'| j                                        D ]}|d	         |d<   |S )B Returns a list of proteomes associated to a given phylome_id
    r_  z/get_proteomes_in_phylome: Check your input datazBSELECT s.taxid, CONCAT(code, ".", pc.version) AS proteome, s.name,z0 source, date, pc.version FROM species AS s, %s z-AS ph, %s AS pc, genome AS g WHERE ph.phylomez2_id = %s AND ph.phylome_id = pc.phylome_id AND pc.zAtaxid = s.taxid AND pc.taxid = g.taxid AND pc.version = g.versionr   r   z;SELECT CONCAT(code, ".", ph.seed_version) AS proteome FROM z.species AS s, %s AS ph WHERE (ph.phylome_id = z%s AND ph.seed_taxid = s.taxid)r\  )r   r   r   r   r   r   rG   r   )r    r  rK   r   r   s        r   get_proteomes_in_phylomez,PhylomeDB3Connector.get_proteomes_in_phylomeF  sM   
 ))z)BB IGHHH PC=PPC:d>OPPC?:NNCNNCI 	.##%% O O#["--88Z#NNNNJc	=PPc	.*==c			#		 .9%%'' 	. 	.C!*o)F

r   c                     d| j         d|d}|                     |          r#d | j                                        D             S g S )r  zSELECT taxid from z WHERE phylome_id=""c                     g | ]
}|d          S )rO   r:   )r*   valuess     r   r,   z>PhylomeDB3Connector.get_species_in_phylome.<locals>.<listcomp>j  s    AAA&fWoAAAr   )r   r   r   rG   r`  s      r   get_species_in_phylomez*PhylomeDB3Connector.get_species_in_phylomec  s\     
 
			JJJ(C AADI,>,>,@,@AAAAir   c                    |                      |          st          d          d}|d| j        d| j        dz  }|d|                     |          z  z  }|dz  }|d	z  }i }|                     |          rY| j                                        D ]?}|                    |d
         g           	                    |d         |d         f           @|S )zh Given a list of phylomeDB IDs, return in which phylomes these IDs have
        been used as a seed
    r3  z0get_phylomes_for_seed_ids: Check your input datazBSELECT CONCAT("Phy", t.protid, "_", code) AS protid, t.phylome_id,z ph.name FROM rg  r   z%species AS s WHERE (t.protid IN (%s) z>AND t.phylome_id = ph.phylome_id AND ph.seed_taxid = s.taxid) z GROUP BY t.protid, ph.phylome_idri   r  r   )
r   r   r   r   r   r   r   rG   r   r  )r    r   rK   phylomesr  s        r   get_phylomes_for_seed_idsz-PhylomeDB3Connector.get_phylomes_for_seed_idsn  s     ))C)88 JHIII PCC$+++t~~~NNC2d6I6I#6N6NOOCKKC--CH Qy!!## Q Q!AhK,,33Q|_QvY4OPPPPOr   c                    |                      |          st          d          |                      |          st          d          |                      |          st          d          |                     |||          }d}|dz  }|d|                     |          z  z  }|d|z  z  }i }|                     |          r| j                                        D ]z}|d         d	k    r-|d
         |v r#||d
                  |d|d
         z  <   ||d
         = |d         d	k    r|d
xx         d|d         z  z  cc<   |||d
         <   ||d
                  d
= {|S )z> Returns all sequences of a given proteome, filtering the
    r_  z)get_seqs_in_genome: Check your input datar{  zDSELECT CONCAT("Phy", p.protid, "_",code) AS protid, copy, prot_name,zC gene_name, seq FROM protein AS p, species AS s, unique_protein AS z&u WHERE p.protid IN (%s) AND p.versionz3 = %s AND p.taxid = s.taxid AND p.protid = u.protidr   r
   ri   z%s_1z_%d)r   r   r  r   r   r   rG   )r    rO   r  r;   r   rK   	sequencesr   s           r   get_seqs_in_genomez&PhylomeDB3Connector.get_seqs_in_genome  s   
 ))u)== CABBB))w)?? CABBB))O)DD CABBB 

eWo
>
>C QCPPC3t7J7J37O7OPPC@GLLCI  
/##%% 	/ 	/#v;??s8}	992;CM2J)VH.
/H&v;??
h---Ec&k2
2---#&	#h- c(m$X..r   r   c                 <   |                      |          st          d          |                      ||          st          d          |                     ||          \  }}}|                     |          }d}	|	dz  }	|	| j        d| j        dz  }	|	d	z  }	|	d
|z  z  }	|	dz  }	|	d|z  z  }	|	|rd||fz  ndz  }	|                     |	          si S |                     d| j        	                                          }
|                     t          |
                                                    }d}	|	dz  }	|	d| j        z  z  }	|	d|d|dz  }	|	d|z  z  }	|                     |	          si S | j        	                                D ]7}|d         r
|d         rdn|d         s|d         rdnd|
|d                  d<   8d}	|	dz  }	|	d| j        z  z  }	|	d |d!|d"z  }	|	d#z  }	|                     |	          si S | j        	                                D ]p}|d$         |
|d                  d%<   |
|d                                      d&d           |d&         |
|d                  d&         k    r|d&         |
|d                  d&<   q|
t          |          fS )'z
    r_  z4get_phylome_seed_ids_info: Check your input data (1))r   r   z4get_phylome_seed_ids_info: Check your input data (2)z=SELECT CONCAT("Phy", p.protid, "_", s.code) AS protid, COUNT(z<DISTINCT method) AS trees FROM (protein AS p, species AS s, z AS ph) LEFT JOIN (z
 AS t) ON zA(ph.phylome_id = t.phylome_id AND p.protid = t.protid) WHERE (ph.z3phylome_id = %s AND ph.seed_taxid = p.taxid AND ph.zBseed_version = p.version AND ph.seed_taxid = s.taxid AND p.protid z7IN (%s)) GROUP BY p.protid ORDER BY trees DESC, protid zLIMIT %d,%dr   ri   zBSELECT CONCAT("Phy", p.protid, "_", s.code) AS protid, raw_alg IS zCNOT NULL AS r, clean_alg IS NOT NULL AS c FROM (protein AS p, speciz4es AS s) LEFT JOIN (%s AS a) ON (p.protid = a.protidz) WHERE (p.protid IN (z) AND p.version =  z,AND p.taxid = s.taxid AND a.phylome_id = %s)r  cr   r
   r   rk  zCSELECT CONCAT("Phy", p.protid, "_", s.code) AS protid, p.comments, zCgene_name, prot_name, copy, length(seq) AS l FROM protein AS p, uniz/que_protein AS u, species AS s, %s AS ph WHERE z(ph.phylome_id = z AND p.protid IN (z) z<AND ph.seed_version = p.version AND ph.seed_taxid = s.taxid)r   
seq_lengthr   )r   r   r  r   r   r   r   r   r   rG   rb   r  r   r   r   )r    r  startoffsetr;   r   rO   r  r   rK   seqsr   s               r   get_phylome_seed_ids_infoz-PhylomeDB3Connector.get_phylome_seed_ids_info  s<    ))z)BB NLMMM))uv)NN NLMMM 33JPPC!!#&&G KCIIC$...$+++NNCNNC@JOOCOOCDPPCf<=E6?**"<CC   i''$)2D2D2F2FGGD!!$tyy{{"3"344GOCPPCATZPPCCgggwwwOOC9ZHHCC   iy!!## ' '),S %'c#h %'AAs8E'3xE'AA%& 3x=&!! QCPPC<OOCC***gggNNCIICC   iy!!## 2 2*-c(d3x=,'
3x=$$VQ///	VtCM*62	2	2&)&kS]F#S>r   r   )NFF)NF)TTrp   rw   )NN)r   NF)3rx   ry   rz   __doc__r   rQ   r   r   r   r   r   r   r   r   r   r   r  r  r  r#  r*  r-  r1  r6  r  rA  rr   rK  rM  rP  rR  rV  r]  ra  re  ri  rl  ro  rq  ru  ry  r  r  r  r  r  r  r  r  r  r  r:   r   r   r   r      sP         ; ; ;
 HP", , , ,8& & &*Q Q Q  *O O O"/ / /b  .  ,  .  4  $ AE$)F F F FP  <  0' ' 'R  B  8    .# # # #J  0   4    H H H H(J J J C C CJ, , , ,\      0  *     ,
 
 
  .; ; ; ;>  "    $   0  :	 	 	  ,$ $ $ $L GK= = = = = =r   )r  
__future__r   rY   r>   r   r   r   r   r
  	six.movesr   r   compiler   r   rb   r   	frozensetr   __all__objectr   r|   r   r:   r   r   <module>r     sT  R . ' & & & & & 				              



         RZ899
dC	233 
!c	 c	 c	 c	 c	f c	 c	 c	L,	 ,	 ,	 ,	 ,	 ,	 ,	 ,	f} } } } }& } } } } }r   