/* Program tacg - a command line tool for Restriction Enzyme digests of DNA  */
/* Copyright  1996 Harry J Mangalam, University of California, Irvine (mangalam@uci.edu, 714 824 4824) */

/* The use of this software (except that by Harald T. Alvestrand, which is described in 'udping.c')
   is bound by the notice that appears in the file 'tacg.h' which should accompany this file.  In the event 
   that 'tacg.h' is not bundled with this file, please contact the author.
*/
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include "tacg.h" /* contains all the defines, includes, function prototypes for both main() and functions */


/*  compare is the ordering callback handed to qsort() for arrays of int.
    n1, n2   pointers to the two ints being compared
    Returns -1, 0 or +1 as *n1 is less than, equal to or greater than *n2.
    The two values are compared rather than subtracted: a subtraction overflows
    (and reports the wrong order) when the operands are far apart, e.g. INT_MIN and 1. */
int compare( const void *n1, const void *n2 ){
        const int left  = *(const int *) n1;
        const int right = *(const int *) n2;

        return (left > right) - (left < right);
}


/********************************  Function Translate  *****************************

Function Translate - takes as input a pointer to a string of nondegenerate
DNA and writes the translated sequence as protein in either 1 or 3 letter
code.  The translation table is selected from Codons [organism][codon][label];
each entry holds the single letter label at pos 0 and the 3 letter label at
pos 1-3.  Only ONE frame is translated per call - call it 3 times with shifted
pointers for a 3 frame translation.  No output formatting is done here.
If len is not a multiple of 3 it is rounded DOWN to one, so a trailing partial
codon is ignored.

Output layout: Prot_out[0..len-1] is first filled with blanks; codon number k
(bases 3k..3k+2) then puts its 1 letter label at Prot_out[3k], or its 3 letter
label at Prot_out[3k..3k+2].  No terminating '\0' is written.

**********************************************************************************/

void Translate (char *DNA_in, char *Prot_out, int len, int n_letters, char Codons[8][64][4], int organism) {
   /* DNA_in   pointer to the DNA seq that needs to be translated, typically the entire DNA array,
               with the correct offset precalculated so that the pointer refers to the correct start
               site
      Prot_out pointer to the array that is filled by Translate() in either 1 or 3 letter code;
               must have room for at least len chars
      len      the length of the DNA sequence to be translated
      n_letters   the number of letters of the aa label - A vs Ala (1 = single letter, anything
               else = 3 letter)
      Codons   array that holds the codon tables for the different organisms or mitos
      organism var that indicates which organism's table should be used for this translation
   */

   int sum[256], i;

   if (len < 3) return;   /* nothing to translate; also keeps a negative len away from memset */
   len -= len % 3;        /* round len down to a multiple of 3 */

   memset (Prot_out, ' ', len);   /* set Prot_out to blanks */
   for (i = 0; i < len; i += 3) {
      hash (DNA_in + i, sum, 3);   /* sum[0] becomes the table index of this triplet */
      if (n_letters == 1) {
         Prot_out[i] = Codons[organism][sum[0]][0];
      } else {
         /* use the SAME organism table as the 1 letter branch (this used to be hard-wired to table 1) */
         memcpy (&Prot_out[i], &Codons[organism][sum[0]][1], 3);
      }
   }
}


/*******************************  Function Read_Codon_Prefs  ***************************************
*                                                                                                  *
*      Read_Codon_Prefs reads in the file that has the codon preferences in human-readable form    *
*      then hashes the codons and inserts the corresponding label into the array "Codons" using    *
*      the index calculated by hashing the triplet (just as the program will do in order to        *
*      translate a sequence of DNA).  The file also has a description token for each table that    *
*      is read into a char array for labelling purposes.                                           *
*                                                                                                  *
*      Expected layout: 8 stanzas, each one header token followed by 64 "triplet label" pairs,     *
*      all separated by whitespace.  A label is 4 chars: the 1 letter code then the 3 letter code. *
*      The program exits with a message if the file cannot be found, opened or fully parsed.       *
*                                                                                                  *
***************************************************************************************************/
void Read_Codon_Prefs (char Codons[8][64][4], char Codon_Labels[8][20]) {
/* Codons           array that holds all the codon preference data for the different organisms or mitos
                                loaded from external file "codon.prefs"
   Codon_Labels   array that holds the organism labels (Universal, various Mitos, etc.); labels
                                longer than 19 chars are truncated to fit
*/
   FILE *fpCodon;
   int i, j, k, sum[256];
   char header[256];      /* one stanza header token */
   char codon[4];         /* "%3s" stores up to 3 chars plus the '\0' */
   char labels[5] = {0};  /* "%4s" stores up to 4 chars plus the '\0' - a 4 byte buffer overflows */
   char triplet[3], *ctmp;

   if ((ctmp = (SearchPaths("codon.prefs", "CODON PREFS"))) == NULL) {
      fprintf(stderr,"Can't open the CODON PREFS file!! - Check spelling, ownership, existance, etc - Bye!!\n");
      exit (1);
   }

   /* ctmp is only used as the path; all parsing goes into local buffers of known size
      (ownership of the string returned by SearchPaths is not visible here, so it is not freed) */
   if ((fpCodon = fopen(ctmp,"r")) == NULL) {   /* if it's not there and readable */
      fprintf(stderr,"Cannot open the file \"codon.prefs\" for    reading (in function    Read_Codon_Prefs)!!\n");
      exit(1);
   }

   for (i=0;i<8;i++) { /* for each different Codon Pref stanza - hard-coded to 8 */
      if (fscanf(fpCodon,"%255s", header) != 1) {
         fprintf(stderr,"Trouble reading the header of stanza %d in the CODON PREFS file - Bye!!\n", i+1);
         fclose(fpCodon);
         exit(1);
      }
      /* bounded copy into the 20 byte label slot, always terminated */
      strncpy(Codon_Labels[i], header, sizeof Codon_Labels[i] - 1);
      Codon_Labels[i][sizeof Codon_Labels[i] - 1] = '\0';

      for (j=0;j<64;j++) { /* for each triplet/label pair in the stanza (64 in all) */
         if (fscanf(fpCodon,"%3s %4s", codon, labels) != 2) {
            fprintf(stderr,"Trouble reading entry %d of stanza %d in the CODON PREFS file - Bye!!\n", j+1, i+1);
            fclose(fpCodon);
            exit(1);
         }
         /* hash() only knows lower case bases; the cast keeps tolower() defined for any char value */
         for (k=0;k<3;k++) triplet[k] = (char) tolower((unsigned char) codon[k]);
         hash(triplet,sum,3); /* hash the triplet to use as an index as to where to put the label */
         memcpy(&Codons[i][sum[0]][0], labels, 4); /* and copy the label to the right place in "Codons" */
      }
   }
   fclose(fpCodon);
   fflush(stderr);
}


/*******************************  Function Degen_Cmp  *************************************
*   Degen_Cmp checks whether a degenerate probe (typically a restriction enz or           *
*   transcription factor binding site) can map onto a pure (non-degenerate) target,       *
*   typically a window of the sequence being analyzed.  Both pointers refer to the        *
*   initial base of the window.  Positions 0-5 are NOT examined: the caller is expected   *
*   to have matched the leading hexamer already, so checking starts at position 6.        *
*   Returns 1 if the probe maps onto the target, 0 if it cannot.                          *
******************************************************************************************/

int  Degen_Cmp (char *d_seq, char *p_seq, int len)  {
/* d_seq    pointer to the degenerate sequence that is going to be used as a probe
   p_seq    pointer to the non-degenerate seq that is going to be used as the target or database
            into which the probe sequence might map
   len      length of the probe seq or how many steps have to be checked
*/
   int pos;

   for (pos = 6; pos < len; pos++) {
      switch (d_seq[pos]) {
         /* plain bases: the target has to carry the very same base */
         case 'a':
         case 'c':
         case 'g':
         case 't':
            if (p_seq[pos] != d_seq[pos]) return 0;
            break;

         /* two-fold codes: the target has to be one of the two named bases */
         case 'y':
            if (!(p_seq[pos] == 'c' || p_seq[pos] == 't')) return 0;
            break;
         case 'r':
            if (!(p_seq[pos] == 'a' || p_seq[pos] == 'g')) return 0;
            break;
         case 'm':
            if (!(p_seq[pos] == 'a' || p_seq[pos] == 'c')) return 0;
            break;
         case 'k':
            if (!(p_seq[pos] == 'g' || p_seq[pos] == 't')) return 0;
            break;
         case 'w':
            if (!(p_seq[pos] == 'a' || p_seq[pos] == 't')) return 0;
            break;
         case 's':
            if (!(p_seq[pos] == 'c' || p_seq[pos] == 'g')) return 0;
            break;

         /* three-fold codes: only the single excluded base is a mismatch */
         case 'b':
            if (p_seq[pos] == 'a') return 0;
            break;
         case 'd':
            if (p_seq[pos] == 'c') return 0;
            break;
         case 'h':
            if (p_seq[pos] == 'g') return 0;
            break;
         case 'v':
            if (p_seq[pos] == 't') return 0;
            break;

         /* 'n' matches anything; any other probe character is skipped as well */
         default:
            break;
      }
   }
   return 1;   /* no position ruled the mapping out */
}




/*********************************  Function Anti_Par  ********************************
*   Anti_Par takes 2 char pointers, the 1st to the original seq, the 2nd to the       *
*   buffer that receives the reverse complement, and the length of the seq to         *
*   consider.  Base m of 'ori' is complemented (IUPAC codes included) and stored at   *
*   position len-1-m of 'anti'; a '\0' is then written at anti[len], so 'anti' needs  *
*   room for len+1 chars.                                                             *
*   A character that has no complement is reported on stderr and carried over         *
*   unchanged, so every one of the len output positions is always written.            *
**************************************************************************************/

void Anti_Par (char *ori, char *anti, int len)  {
/* ori   pointer to the beginning of the original seq
   anti  pointer to the beginning of the converted anti parallel sequence
   len   the length of the sequence, both original and converted
*/
   int m;
   char comp;

   if (len < 0) len = 0;   /* a negative length would put the terminator before the buffer */

   for (m=0;m<len;m++) {
      switch(ori[m]) {
         case 'a': comp = 't'; break;
         case 'c': comp = 'g'; break;
         case 'g': comp = 'c'; break;
         case 't': comp = 'a'; break;
         case 'r': comp = 'y'; break;
         case 'y': comp = 'r'; break;
         case 'w': comp = 'w'; break;
         case 's': comp = 's'; break;
         case 'm': comp = 'k'; break;
         case 'k': comp = 'm'; break;
         case 'b': comp = 'v'; break;
         case 'd': comp = 'h'; break;
         case 'h': comp = 'd'; break;
         case 'v': comp = 'b'; break;
         case 'n': comp = 'n'; break;
         default:  /* bad character detection */
            fprintf(stderr,"Acck! In Anti_Par(), I don't like char# %d= %c! \n",m, ori[m]);
            comp = ori[m];   /* keep the offender visible instead of leaving the slot unwritten */
            break;
      }  /* end of switch/case statement */
      anti[len-1-m] = comp;
   }
   anti[len] = '\0';
   fflush(stderr);
}


/***********************  Function Rev_Compl  *****************************************
*   Rev_Compl takes 2 char pointers, the 1st to the original seq, the 2nd to the      *
*   buffer that receives the converted seq, and the length of the seq to consider.    *
*   It runs Anti_Par() from 'ori' into 'rev' and then Reverse()s 'rev' in place.      *
*   NOTE(review): Anti_Par() already stores the complement back-to-front, so the      *
*   extra Reverse() leaves 'rev' holding the complement in the SAME base order as     *
*   'ori' (not the reverse complement) - confirm against the callers that this is     *
*   the intended result.                                                              *
**************************************************************************************/

void Rev_Compl (char *ori, char *rev, int len)  {
/* variables same as in Anti_Par, above */
   Anti_Par (ori,rev,len);   /* fills rev[0..len-1] and writes a '\0' at rev[len] */
   Reverse (rev);            /* flips rev end-for-end, using strlen(rev) as its length */
}


/********************  Function reverse  ********************************
* reverses the '\0' terminated string s in place by swapping chars      *
* from both ends toward the middle; strings of 0 or 1 chars are left    *
* as they are                                                           *
************************************************************************/

void Reverse (char *s)  {
/* s  pointer to the beginning of the array that holds the seq to be reversed */
   char *head, *tail;
   char held;
   size_t n = strlen(s);

   if (n < 2) return;   /* nothing to swap */

   head = s;
   tail = s + (n - 1);
   while (head < tail) {
      held = *head;
      *head = *tail;
      *tail = held;
      head++;
      tail--;
   }
}

/* Function Triplet_Reverse reverses a string in place, triplet by triplet ie:
        ArgTrpPheAsnCys ==> CysAsnPheTrpArg  so as to make the 6 frame translations readable in the
        opposite orientation.
        str   the '\0' terminated string to be rearranged; its length should be a multiple of 3.
        Triplets are exchanged pairwise from the two ends inward and the walk stops as soon as the
        two triplets would touch or cross, so a middle triplet (odd count) stays where it is and no
        pair is ever swapped twice.  If the length is not a multiple of 3 the leftover 1-2 chars in
        the middle are left untouched; strings shorter than 6 chars are returned unchanged. */
void Triplet_Reverse (char *str) {
        int lo, hi, j;
        char held;

        hi = (int) strlen(str) - 3;          /* start of the last triplet (negative if there is none) */
        for (lo = 0; lo + 3 <= hi; lo += 3, hi -= 3) {
                for (j = 0; j < 3; j++) {
                        held = str[lo+j];
                        str[lo+j] = str[hi+j];
                        str[hi+j] = held;
                }
        }
}


/*************************  Function hash  *******************************************************
*   hash() turns an n-mer into its base-4 integer code: a=0, c=1, g=2, t=3, with position 'el'   *
*   weighted by key[el] = 4^el, so the FIRST base is the least significant digit.                *
*   A degenerate base multiplies the number of codes by 2, 3 or 4; every code the n-mer can      *
*   expand to is written to sum[0..degen-1] and that count is the return value.                  *
*   A character that is not handled is reported on stderr and adds nothing to the codes.         *
*   NOTE(review): nothing here checks 'num' against the 6 entries of key[] or the number of      *
*   codes against the size of sum[] - presumably the callers guarantee both; confirm.            *
*************************************************************************************************/
int hash (char *nmer, int sum[], int num) { 
/* nmer  pointer to begin of array that holds the nmer to be 'hashed'
   sum   array (assigned in main() to be [256]) that receives all the possible variants of the 
         hashed sequence
   num   length of the nmer to be hashed; yes I know it's redundant...
   returns the number of entries of sum[] that were filled in
*/
   int key[6] = { 1, 4, 16, 64, 256, 1024};   /* weight of each position: 4^el */
         int el, h, N_degen, t_degen, degen, i;
   degen = N_degen = 1;   /* degen = codes held so far, N_degen = codes after the current base */
   memset(sum,0,256);  /* NOTE(review): the count is in BYTES, so this clears 256 bytes, not 256 ints; the code below only needs sum[0] to start at 0 */

   /* Big 'for' loop that calculates the hexamer, gets executed at every RE site, */
   for (el = 0;  el < num;  el++){        /* and at every overlapping n-mer */
      switch (nmer[el])  { /* pointer passed to function already offset to starting position */
      /* a=0, c=1, g=2, t=3, degenerates are handled below */
      case 'a':   break;   /* not really needed - (a) = 0 so no change in sum */
      case 'c': for (i=0; i<degen; i++) sum[i] = sum[i] + key[el];  break;
      case 'g': for (i=0; i<degen; i++) sum[i] = sum[i] + 2*key[el];  break;
      case 't': for (i=0; i<degen; i++) sum[i] = sum[i] + 3*key[el];  break;

      /*  !!!! Now the degeneracies !!!!  */
      /* Each degenerate case first has fill_out_sum() copy the existing 'degen' codes into the
         extra slots, then adds a different base value to each copy of that group of codes */
      /* Double degeneracies  */

      case 'y':   /* c or t  */  
         N_degen = degen*2; t_degen = N_degen-1;   /* t_degen = index of the last slot in use */
         fill_out_sum (degen, N_degen, sum);
         for (i=0; i<(degen); i++)  {
            sum[i] = sum[i] + key[el];   /* counting up sum, incr for 'c'*/
            sum[t_degen-i] = sum[t_degen-i] + 3*key[el]; /* counting down sum,*/
         }  break;                                 /*    incr for 't' */
      case 'r':   /* g or a */
         N_degen = degen*2; 
         fill_out_sum(degen, N_degen, sum);
         for (i=0; i<(degen); i++)  {
         /*  'a' will give 0 - no change, so the upper copy is left as it is */
            sum[i] = sum[i] + 2*key[el];    /* counting up sum, incr for 'g' */
         }  break;
      case 'm':   /* a or c */
         N_degen = degen*2; 
         fill_out_sum(degen, N_degen, sum);
         for (i=0; i<(degen); i++)  {
         /*  'a' will give 0 - no change, so the upper copy is left as it is */
            sum[i] = sum[i] + key[el];    /* counting up sum, incr for 'c' */
         }  break;
      case 'k':   /* g or t */
         N_degen = degen*2;   t_degen = N_degen-1;
         fill_out_sum(degen, N_degen, sum);
         for (i=0; i<(degen); i++)  {
            sum[i] = sum[i] + 2*key[el];                 /* counting up sum, incr for 'g' */
            sum[t_degen-i] = sum[t_degen-i] + 3*key[el];  /* counting down sum, incr for 't' */
         }  break;
      case 's':   /* c or g */
         N_degen = degen*2;    t_degen = N_degen-1;
         fill_out_sum(degen, N_degen, sum);
         for (i=0; i<(degen); i++)  {
            sum[i] = sum[i] + key[el];                /* counting up sum, incr for 'c' */
            sum[t_degen-i] = sum[t_degen-i] + 2*key[el];  /* counting down sum, incr for 'g' */
         }  break;
      case 'w':   /* a or t */
         N_degen = degen*2;   
         fill_out_sum(degen, N_degen, sum);
         for (i=0; i<(degen); i++)  {
         /*  'a' will give 0 - no change, so the upper copy is left as it is */
            sum[i] = sum[i] + 3*key[el];    /* counting up sum, increment for 't' */
         }  break;

      /* Triple degeneracies  */
      case 'b':   /* not a - so c, g, or t */
         h = 2*degen;   /* offset of the third copy */
         N_degen = degen*3;  
         fill_out_sum(degen, N_degen, sum);
         /* Increment all the array values in sum, based on the degeneracies */
         for (i=0; i<degen; i++)  {
            sum[i] = sum[i] + key[el];    /* counting up sum, increment for 'c' */
            sum[i+degen] = sum[i+degen] + 2*key[el];    /* counting up sum, increm for 'g' */
            sum[i+h] = sum[i+h] + 3*key[el];    /* counting up sum, incr for 't' */
         } break;
      case 'd':   /* not c - so a, g, or t */
         h = 2*degen;    
         N_degen = degen*3;  
         fill_out_sum(degen, N_degen, sum);
         /* Increment all the array values in sum, based on the degeneracies */
         for (i=0; i<degen; i++)  {
         /*  'a' will give 0 - no change, so the first copy is left as it is */
            sum[i+degen] = sum[i+degen] + 2*key[el];    /* counting up sum, incr for 'g' */
            sum[i+h] = sum[i+h] + 3*key[el];    /* counting up sum, incr for 't' */
         } break;
      case 'h':   /* not g - so a, c, or t */
         h = 2*degen;   
         N_degen = degen*3;  
         fill_out_sum(degen, N_degen, sum);
         /* Increment all the array values in sum, based on the degeneracies */
         for (i=0; i<degen; i++)  {
         /*  'a' will give 0 - no change, so the second copy is left as it is */
            sum[i] = sum[i] + key[el];    /* counting up sum, increment for 'c' */
            sum[i+h] = sum[i+h] + 3*key[el];  /* counting up sum, incr for 't' */
         } break;
      case 'v':   /* not t - so a, c, or g */
         h = 2*degen;   
         N_degen = degen*3;  
         fill_out_sum(degen, N_degen, sum);
         /* Increment all the array values in sum, based on the degeneracies */
         for (i=0; i<degen; i++)  {
         /*  'a' will give 0 - no change, so the third copy is left as it is */
            sum[i] = sum[i] + key[el];    /* counting up sum, increment for 'c' */
            sum[i+degen] = sum[i+degen] + 2*key[el]; /* counting up sum, incr for 'g' */
         } break;

      /* And the big old quadruple degeneracy  */
      case 'n':   /* a,c,g, or t */
         h = 2*degen;   
         N_degen = degen*4;      /* t_degen = N_degen-1; */
         fill_out_sum(degen, N_degen, sum);
         /* Increment all the array values in sum, based on the degeneracies */
         for (i=0; i<degen; i++)  {
         /*  'a' will give 0 - no change, so the fourth copy is left as it is */
            sum[i] = sum[i] + key[el];    /* counting up sum, increment for 'c' */
            sum[i+degen] = sum[i+degen] + 2*key[el];    /* counting up sum, increment for 'g' */
            sum[i+h] = sum[i+h] + 3*key[el];    /* counting up sum, increment for 't' */
         } break;
      default:  /* bad character detection - report it; the codes are left unchanged */
      fprintf (stderr, "Acck! in hash(), I don't like %c (at el = %d of %d)! \n", nmer[el], el, num); 
      break;
      }  /* end of switch/case statement */
      degen=N_degen;   /* the expanded set becomes the working set for the next base */
   }  /* end of big 'for' loop */
/* fprintf(stderr, "fsum= %d\n", sum[0]); */
   fflush(stderr);
   return degen;
}  

/***************************  Function Palindrome  ********************************
*   palindrome returns 1 if the site reads the same as its own reverse            *
*   complement (IUPAC codes included), 0 if it doesn't.  Base i is checked        *
*   against base length-1-i, walking in from the left end to the middle, and the  *
*   walk stops at the 1st pair that does not fit.  With an odd length the middle  *
*   base is checked against itself, so it has to be one of w, s or n.             *
*   A character that is not a known code is reported on stderr and that           *
*   position is then skipped without failing the test.                            *
**********************************************************************************/

int palindrome (char *site, int length) {
/* site     the site to be 'palindromed'; name should really be more generic
   length   the length of the sequence to be palindromed; yes, redundant, but convenient
*/
   /* bases[k] pairs with partners[k] */
   static const char bases[]    = "acgtyrmkwsbdhvn";
   static const char partners[] = "tgcarykmwsvhdbn";
   const char *hit;
   int i, last, halflength;

   halflength = length / 2;
   if (length % 2 != 0) halflength++;   /* odd length: include the middle base */
   last = length - 1;                   /* index of the right-most base */

   for (i = 0; i < halflength; i++) {
      hit = (const char *) memchr(bases, site[i], sizeof bases - 1);
      if (hit == NULL) {
         fprintf(stderr,"palindrome doesn't like %c! \n", *(site+i));
         continue;
      }
      if (site[last - i] != partners[hit - bases]) {
         fflush(stderr);
         return 0;
      }
   }
   fflush(stderr);
   return 1;
}

/***************************  Function fill_out_sum  *************************************
*   fill_out_sum replicates the first O_dgen entries of s[] into the slots that          *
*   follow them, one group of O_dgen at a time, starting at index O_dgen and going on    *
*   for as long as the start of the next group is below N_dgen.                          *
*****************************************************************************************/
void fill_out_sum (int O_dgen, int N_dgen, int s[])  {  
/* O_dgen   old degeneracy = the number of valid entries at the front of s
   N_dgen   new degeneracy = the number of entries s should hold afterwards
   s        the array of hashed values that is extended in place  */
   int start = O_dgen;   /* index where the next copy begins */
   int k;
   int *copy;

   while (start < N_dgen) {
      copy = s + start;
      k = 0;
      do  {
         copy[k] = s[k];
         k++;
      } while (k < O_dgen);
      start += O_dgen;
   }
}
