#!/usr/bin/env python3

# Needs a folder full of kraken .out files to search through, and an input file with TaxIDs to search for. 
# The TaxID input file can have comments (lines that start with #) and multiple data elements per line. 
# The first element on each line is assumed to be a TaxID, the rest are ignored here. 

# Example: python search_for_id.py -d nt-0.12-lrcr/ -i ids.txt
# will search each file inside directory nt-0.12-lrcr/ to look for any IDs listed in ids.txt:
# 
# # Things to look for 
# 13750 cranberry
# 4079    henbane
# 41893   riverbank grape Vitis riparia
# 91214   butternut   Juglans cinerea

import sys, os, argparse

def main(PATH, q):
    """Search every ``.out`` file under a directory for the listed TaxIDs.

    Walks ``PATH`` recursively and hands each non-hidden file whose
    extension is ``.out`` (compared case-insensitively) to ``process_file``
    together with ``q``.

    Args:
        PATH: Directory to walk. If it is not a directory, a notice is
            written to stderr and nothing is searched.
        q: Path to the TaxID list file; passed through unchanged to
            ``process_file``.
    """
    if not os.path.isdir(PATH):
        print('Not a directory, nothing searched: "' + str(PATH) + '"',
              file=sys.stderr)
        return

    # Keep the trailing slash so reported paths for top-level files look the
    # same as they always have (e.g. "dir/file.out").
    if not PATH.endswith("/"):
        PATH = PATH + "/"

    for subdir, _dirs, files in os.walk(PATH):
        for file in files:
            # Skip hidden files (e.g. editor or OS metadata files).
            if file.startswith("."):
                continue

            # Only files with a ".out" extension are searched.
            extension = os.path.splitext(file)[1]
            if extension.lower() != ".out":
                continue

            # os.walk yields nested directory names without a trailing
            # separator, so plain string concatenation would glue the
            # directory and file names together; join inserts the separator
            # only when it is missing.
            process_file(os.path.join(subdir, file), q)

def _load_ids(q):
    """Return the set of IDs listed in the file at path ``q``.

    The first whitespace-separated token of each line is taken as the ID.
    Blank lines and lines whose first token starts with ``#`` are skipped.
    """
    ids = set()

    with open(q, 'r') as fq:
        for line in fq:
            tokens = line.split(None, 1)

            # A blank or whitespace-only line produces no tokens at all;
            # indexing it would raise IndexError, so skip it explicitly.
            if not tokens:
                continue

            first_token = tokens[0]
            if first_token.startswith('#'):
                continue

            # split() has already removed surrounding whitespace.
            ids.add(first_token)

    return ids


def process_file(filepath, q):
    """Print each line of ``filepath`` whose fifth column is a listed ID.

    Every line of ``filepath`` is split on whitespace. When a line has at
    least five fields and the fifth one appears in the ID list read from
    ``q``, the fields are printed joined by ", " followed by ``filepath``.
    The fifth column is presumably the taxonomy-ID column of a Kraken
    report -- confirm against the files being searched.

    Args:
        filepath: Path of the ``.out`` file to scan.
        q: Path of the file listing the IDs to look for (first token per
            line; ``#`` comment lines and blank lines are ignored).

    Raises:
        OSError: If either file cannot be opened.
    """
    ids = _load_ids(q)

    # Stream the file line by line; the context manager guarantees the
    # handle is closed even if printing or parsing fails part-way through.
    with open(filepath, "r") as f:
        for line in f:
            attrs = line.split()

            if len(attrs) > 4 and attrs[4] in ids:
                print(", ".join(attrs) + ", " + filepath)
                    

if __name__ == '__main__':
    # Command-line entry point: both options are mandatory.
    cli = argparse.ArgumentParser(description="Search for TaxID")
    cli.add_argument(
        '-d', '--dir',
        dest="dir",
        required=True,
        help='Enter the directory to check',
    )
    cli.add_argument(
        '-i', '--input',
        dest="input",
        required=True,
        help='Enter a path to a file containing taxids in a list',
    )
    options = cli.parse_args()

    # Search the chosen directory for the IDs listed in the input file.
    main(options.dir, options.input)
