#!/usr/bin/env python3

# PROCESS_FASTP_JSON.PY
#
#   Combines a group of fastp JSON reports (from fastq files) into one CSV output.
#   Example: python3 process_fastp_json.py /path/to/json/directory/
#
import sys, os, json, csv

def main(PATH):
    """Combine all fastp JSON reports found under PATH into 'out.csv'.

    Recursively walks PATH; every visible '*.json' file becomes one column
    of the output CSV, with the left-most column holding the row labels.
    Writes 'out.csv' in the current working directory only if at least one
    report was found.

    Args:
        PATH: Directory to search (recursively) for fastp JSON reports.
    """
    out_data = [ # left-most column is the data headers; one row per metric
        ["file name"],
        ["fastp version"],
        ["sequencing"],
        ["mean length before filtering"],
        ["duplication rate"],
        ["insert size peak"],
        ["total reads (before filtering)"],
        ["total bases (before filtering)"],
        ["Q20 bases (before filtering)"],
        ["Q30 bases (before filtering)"],
        ["GC content (before filtering)"],
        ["total reads (after filtering)"],
        ["total bases (after filtering)"],
        ["Q20 bases (after filtering)"],
        ["Q30 bases (after filtering)"],
        ["GC content (after filtering)"],
        ["reads passed filters"],
        ["reads corrected"],
        ["bases corrected"],
        ["reads with low quality"],
        ["reads with too many N"],
        ["reads too short"],
        ["reads with low complexity"]
    ]
    json_count = 0
    # Iterate over all subfolders in the supplied path
    for subdir, dirs, files in os.walk(PATH):
        for file in files:
            # BUG FIX: was `subdir + file`, which drops the path separator and
            # produces a wrong path for files in subdirectories (or whenever
            # PATH lacks a trailing slash).
            filepath = os.path.join(subdir, file)
            extension = os.path.splitext(filepath)[1] # get extension

            # Skip anything that is not a visible .json file (hidden files
            # such as macOS '._foo.json' resource forks are ignored).
            if extension.lower() != ".json" or file.startswith("."):
                continue

            json_count += 1
            # BUG FIX: the file handle was previously never closed.
            with open(filepath) as f:
                data = json.load(f)

            summary = data['summary']
            before_filtering = summary["before_filtering"]
            after_filtering = summary["after_filtering"]
            filtering_result = data["filtering_result"]
            duplication_rate = data['duplication']['rate']
            insert_size_peak = data['insert_size']['peak']

            # One value per metric, in the same order as the rows above.
            # NOTE(review): assumes a paired-end report ('read2_mean_length'
            # present); single-end fastp reports would raise KeyError — confirm.
            column = [
                file,
                summary['fastp_version'],
                summary['sequencing'],
                str(before_filtering["read1_mean_length"]) + "bp, " + str(before_filtering["read2_mean_length"]) + "bp",
                str(round(duplication_rate * 100, 4)) + "%",
                insert_size_peak,
                before_filtering['total_reads'],
                before_filtering['total_bases'],
                before_filtering['q20_bases'],
                before_filtering['q30_bases'],
                before_filtering['gc_content'],
                after_filtering['total_reads'],
                after_filtering['total_bases'],
                after_filtering['q20_bases'],
                after_filtering['q30_bases'],
                after_filtering['gc_content'],
                filtering_result['passed_filter_reads'],
                filtering_result['corrected_reads'],
                filtering_result['corrected_bases'],
                filtering_result['low_quality_reads'],
                filtering_result['too_many_N_reads'],
                filtering_result['too_short_reads'],
                filtering_result['low_complexity_reads'],
            ]
            for row, value in zip(out_data, column):
                row.append(value)

    if json_count != 0:
        # BUG FIX: only create/truncate out.csv when there is data to write;
        # previously an empty out.csv clobbered any existing file even when
        # no reports were found.
        with open("out.csv", "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(out_data)
        print("SUCCESS: Wrote data from " + str(json_count) + " .json files into 'out.csv'.")
    else:
        print("ERROR: No .json files found in specified directory.")

if __name__ == '__main__':
    # Guard clause: a target directory is mandatory.
    if len(sys.argv) < 2:
        print("ERROR: No path was given. Please specify a path:\n\n     $ python3 process_fastp_json.py /path/to/json/directory/\n")
    else:
        main(sys.argv[1])

