'''
Copyright (c) 2014, University of Illinois at Urbana-Champaign
All rights reserved.

Developed by:

    Dr. Rodriguez-Zas's Laboratory of Statistical Genetics and Bioinformatics
    University of Illinois at Urbana-Champaign
    http://ansci.illinois.edu/labs/bioinformatics

Permission is hereby granted, free of charge, to any person obtaining a copy of this 
software and associated documentation files (the Software), to deal with the Software 
without restriction, including without limitation the rights to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
to whom the Software is furnished to do so, subject to the following conditions:

Redistribution and use in source and binary forms, with or without modification, 
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this 
   list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this 
   list of conditions and the following disclaimer in the documentation and/or other 
   materials provided with the distribution.
3. Neither the names of Dr. Rodriguez-Zas's Laboratory of Statistical Genetics and 
   Bioinformatics, University of Illinois at Urbana-Champaign, nor the names of its 
   contributors may be used to endorse or promote products derived from this Software 
   without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY 
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 
DAMAGE.
'''


import random
import re
import sys
import argparse
import mimetypes
import os
import itertools


# one-letter codes of the 20 standard amino acids; this must stay a mutable
# list, and the element order is kept as-is because residues are sampled
# from it with a seeded generator
OneAAlist = list('ACDEFGHKILMNPQRSTVWY')

# arguments
def arguments():
    '''Parse the command line of the decoy generator.

    Returns the argparse namespace holding permutations, seed, max_length,
    infile, outfile and iupac.  The input file (-in) is mandatory; iupac is
    delivered as the text 'True' or 'False' rather than as a bool.
    '''
    integer_label = '<Integer>'
    # one (flag, keyword settings) pair per option, in the order they are registered
    option_table = [
        ('-perm', {
            'type': int, 'metavar': integer_label, 'dest': 'permutations', 'default': 100,
            'help': 'The number of random permutations for each sequence (default=100)',
        }),
        ('-seed', {
            'metavar': integer_label, 'dest': 'seed', 'type': int, 'default': 3343,
            'help': 'Initialize the random number generator (default=3343)',
        }),
        ('-max-length', {
            'metavar': integer_label, 'dest': 'max_length', 'type': int, 'default': 7,
            'help': 'The maximum peptide length when all possible permutations can be used (default=7)',
        }),
        ('-in', {
            'metavar': '<input_file>', 'dest': 'infile', 'type': str, 'required': True,
            'help': 'Input file name',
        }),
        ('-out', {
            'metavar': '<output_file>', 'dest': 'outfile', 'type': str, 'default': "decoys.fasta",
            'help': 'Output file name (default=decoys.fasta)',
        }),
        ('-iupac', {
            'metavar': '<True or False>', 'dest': 'iupac', 'type': str, 'default': 'False',
            'choices': ['False', 'True'],
            'help': '''Generate permuted peptides with Any(X), Asparagine (B), and Glutamine (Z)
                     amino acids in addition to standard 20 amino acids (default=False)''',
        }),
    ]
    parser = argparse.ArgumentParser(description='Generate k-permuted decoy peptides')
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    return parser.parse_args()
    

# fix end of line (EOL) character
def endOfLine(lines):
    '''Return the text with every CR LF pair and every lone CR turned into LF.'''
    # collapse the two-character breaks first so they do not become two LFs
    without_crlf = lines.replace('\r\n', '\n')
    return '\n'.join(without_crlf.split('\r'))

    
# record one peptide parsed from the fasta file, skipping sequences already seen
def load_sequences(sequences, uniquesequences, fasta_identifier, pepsequence):
    '''Register one peptide unless the same sequence was registered before.

    sequences        -- dict mapping peptide length to a set of
                        (sequence, identifier) tuples; updated in place
    uniquesequences  -- set of every sequence accepted so far; updated in place
    fasta_identifier -- header of the record; only its first 30 characters are kept
    pepsequence      -- the peptide sequence

    Returns the (sequences, uniquesequences) pair that was passed in.
    '''
    short_identifier = fasta_identifier[:30]
    if pepsequence in uniquesequences:
        print ('Removing duplicate sequence: %s' %short_identifier)
        return sequences, uniquesequences
    peplength = len(pepsequence)
    # reuse the set stored for this length, or start a fresh one
    same_length = sequences.get(peplength) or set()
    same_length.add((pepsequence, short_identifier))
    sequences[peplength] = same_length
    uniquesequences.add(pepsequence)
    return sequences, uniquesequences


# validate input file
def validate_inputfile(filename):
    '''Parse a FASTA file of target peptides and collect the valid, unique ones.

    The file must be readable and, judging by its name, either plain text or
    of unknown type.  Every record needs a header line and at least one
    sequence line.  Records whose sequence contains a character outside
    ABCDEFGHIKLMNPQRSTVWXYZ are reported and skipped without failing.

    Returns a tuple (ok, sequences, uniquesequences):
      ok              -- False when the file is unreadable, is not plain text
                         or breaks the FASTA format; True otherwise
      sequences       -- dict mapping peptide length to a set of
                         (sequence, first 30 characters of the header) tuples
      uniquesequences -- set of all accepted sequences
    Records accepted before a format error are still present in the result.
    '''
    # dict to save provided unique sequences
    sequences = {}
    # unique target peptides
    uniquesequences = set()
    # matches any character that is not an upper-case IUPAC amino acid code
    valid_aa = re.compile("[^ABCDEFGHIKLMNPQRSTVWXYZ]")
    if not os.access(filename, os.R_OK):
        print ('Input file is not readable')
        return False,sequences,uniquesequences
    # the type is guessed from the file name only; None means "unknown"
    guessed_type = mimetypes.guess_type(filename)[0]
    if guessed_type is not None and guessed_type != 'text/plain':
        print ('Input file is not a text\\plain file')
        return False,sequences,uniquesequences
    # close the handle as soon as the content is in memory
    with open(filename) as handle:
        content = handle.read()
    # without a single header there is no record to look at; checking here
    # avoids indexing past the end of the split result below
    if '>' not in content:
        print ('Invalid fasta format: no sequence header (>) found')
        return False,sequences,uniquesequences
    fin = re.split('>', endOfLine(content))
    if fin[1].find('\n') == -1:
        print ('Invalid fasta format: carriage return missing from the lines')
        return False,sequences,uniquesequences
    # fin[0] is whatever precedes the first header and is ignored
    for seq_num, fasta in enumerate(fin[1:], 1):
        mlines = re.split('\n', fasta.strip())
        if len(mlines) < 2:
            print ('Invalid fasta format for sequence number: %s' %seq_num)
            return False,sequences,uniquesequences
        # a sequence may be wrapped over several lines
        sequence = ''.join(mlines[1:])
        if len(sequence) == 0:
            print ('Invalid fasta format for sequence number: %s' %seq_num)
            return False,sequences,uniquesequences
        if valid_aa.search(sequence):
            print ('Invalid characters in sequence number: %s' %seq_num)
        else:
            sequences, uniquesequences = load_sequences(sequences, uniquesequences, mlines[0],sequence)
    return True,sequences,uniquesequences


# print/write randomly generated peptides
def writepeptides(fpeps,decoys_current_peptide):
    '''Write one FASTA record per (identifier, sequence) entry to fpeps.

    Each record is the header '>perm:<identifier>|length:<sequence length>'
    followed by the sequence on its own line.
    '''
    record_template = '>perm:%s|length:%s\n%s\n'
    for entry in decoys_current_peptide:
        identifier = entry[0]
        decoy = entry[1]
        fpeps.write(record_template %(identifier, len(decoy), decoy))
    

# random sampling function to generate decoy peptides
def randomSampling(peptide,fasta_identifier,uniquesequences,fpeps,finalDB,permutations,generator,possible_dbsize):
    '''Generate up to `permutations` new random decoys for one target peptide.

    peptide          -- target sequence; only its length is used
    fasta_identifier -- identifier written in the header of every decoy
    uniquesequences  -- target sequences that must never appear as decoys
    fpeps            -- writable handle of the output file
    finalDB          -- set of decoys already produced for this peptide
                        length; updated in place
    permutations     -- number of new decoys wanted
    generator        -- random.Random instance supplying the residues
    possible_dbsize  -- upper bound on the size of finalDB

    Returns the tuple (more_possible, finalDB, fpeps).  more_possible is
    False once finalDB has reached possible_dbsize.  The tuple is returned
    in both cases because the caller unpacks three values.
    '''
    target_size = len(finalDB) + permutations
    # a list keeps the decoys in the order they were generated, so the file
    # content is reproducible for a given seed; the finalDB membership test
    # below already guarantees that no decoy is collected twice
    decoys_current_peptide = []
    while (len(finalDB) < possible_dbsize) and (len(finalDB) < target_size):
        # sample with replacement from OneAAlist, one residue at a time
        pepseq = ''.join(generator.sample(OneAAlist, 1)[0] for _ in peptide)
        # ensure that the decoy database does not contain any target peptide
        if (pepseq not in uniquesequences) and (pepseq not in finalDB):
            finalDB.add(pepseq)
            decoys_current_peptide.append((fasta_identifier,pepseq))
    # nothing to write when no new decoy could be generated
    if decoys_current_peptide:
        writepeptides(fpeps,decoys_current_peptide)
    if len(finalDB) >= possible_dbsize:
        print ('Cannot generate more decoy peptides of length: %s' %len(peptide))
        return False,finalDB,fpeps
    return True,finalDB,fpeps


# generate and write all peptides that are <= max-length parameter
def shorter_peptides(fpeps,countPerm,peptide_size,uniquesequences):
    '''Write every sequence of length peptide_size that is not a target.

    All combinations of the residues in OneAAlist are enumerated; those found
    in uniquesequences are skipped.  Each written record is numbered with a
    running counter that continues from countPerm.

    Returns the counter value after the last written record.
    '''
    record_template = '>perm:%s|peptide_length:%s\n%s\n'
    candidates = (''.join(residues) for residues in itertools.product(OneAAlist, repeat=peptide_size))
    for decoy in candidates:
        if decoy in uniquesequences:
            continue
        countPerm += 1
        fpeps.write(record_template %(countPerm, len(decoy), decoy))
    return countPerm

# generate the decoy peptides, by random sampling or by full enumeration
def getpeptides(sequences, uniquesequences, args):
    '''Generate the decoy peptides for all targets and write them to args.outfile.

    sequences       -- dict mapping peptide length to a set of
                       (sequence, identifier) tuples
    uniquesequences -- set of all target sequences; never emitted as decoys
    args            -- namespace providing permutations, seed, max_length
                       and outfile

    For each peptide length the number of requested decoys (targets times
    permutations) is compared with the number of non-target sequences that
    exist over the current alphabet:
      * fewer requested than available -> random sampling per target
      * otherwise, length <= max_length -> every non-target sequence is written
      * otherwise -> nothing can be generated and a warning is printed

    Returns None.  Nothing is written when the output directory is not writable.
    '''
    countPerm = 0
    permutations = args.permutations
    # fixed seed so that the results are reproducible
    generator = random.Random(args.seed)
    # check write permission for the output file
    if not os.access(os.path.abspath(os.path.dirname(args.outfile)), os.W_OK):
        print ('Output file cannot be generated: permission denied')
        return
    # the with-block closes the file even when generation fails midway
    with open(args.outfile,'w') as fpeps:
        for k,v in sequences.items():
            # number of non-target sequences of this length; the alphabet
            # size follows OneAAlist, which may have been extended beyond 20
            possible_dbsize = pow(len(OneAAlist), k) - len(v)
            requested = len(v)*permutations
            if requested < possible_dbsize:
                finalDB = set()
                # sets of strings iterate in a hash-dependent order; sorting
                # keeps the consumption of the random stream, and therefore
                # the output, identical between runs with the same seed
                for peptide in sorted(v):
                    outcome = randomSampling(peptide[0],peptide[1],uniquesequences,fpeps,
                                             finalDB,permutations,generator,possible_dbsize)
                    # stop on exhaustion, whether it is signalled by a bare
                    # False or by a False first element of the result tuple
                    if outcome is False or not outcome[0]:
                        break
            elif k <= args.max_length:
                # generate all possibilities for peplength <= max-length
                if args.max_length > 7:
                    print ('WARNING: all possible permutations of peptides greater than seven amino acids in length are very large')
                countPerm = shorter_peptides(fpeps,countPerm,k,uniquesequences)
            else:
                print ('WARNING: no decoy peptides generated for length %s: '
                       'the requested number exceeds what is possible and the length is above max-length' %k)

## main section    
args = arguments()
# extend OneAAlist when the IUPAC option is selected; the command line
# delivers the option as text, so it is mapped to a bool first
bool_types = {'True':True, 'False':False}
if bool_types.get(args.iupac):
    OneAAlist.extend(['X', 'B', 'Z'])
fasta,sequences,uniquesequences = validate_inputfile(args.infile)
# decoys are generated for whatever was loaded, even after a format error
if fasta or sequences:
    getpeptides(sequences, uniquesequences, args)
