/* Program tacg - a command line tool for Restriction Enzyme digests of DNA  */
/* Copyright  1996 Harry J Mangalam, University of California, Irvine (mangalam@uci.edu, 714 824 4824) */

/* The use of this software (except that by Harald T. Alvestrand, which is described in 'udping.c')
   is bound by the notice that appears in the file 'tacg.h' which should accompany this file.  In the event 
   that 'tacg.h' is not bundled with this file, please contact the author.
*/
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h> 
#include "tacg.h"   /* contains all the defines, includes, function prototypes for both main() and functions; 
                      simple enuf not to have a version #; I'll just keep it up to date as I go...*/


main(int argc, char *argv[]) {              
   FILE *fpinRE;
/* Key variable descriptions
i,j,k,l,m,mm   general counters
li,lj       long counters to allow large counts
key         array that holds the keys for generating the hash values of the DNA hexamers
el          counter for 'key[]', above
sum         array for keeping score of the degenerate sums
degen       degeneracy
NumRES      the calculated # of REs (as opposed to MAX_NUM_RES, the maximum #). 
            Due to the problem of handling nonpalindromes, the 1st of the RE array will start at 10, 
            not at 0, so further references will have to consider this little quirk.
baseK       constant for determining the values for the dif bases
t_degen     top of the degen array (usually degen-1)
RE_rawsite  temp char array
s           temp pointer to char
nucleotide  the current nucleotide for switch statement
*chk_sits[] the array of pointers that parallels hashtable to keep track of which REs cut at each 
            hash value; it provides an index into the RE struct (what to check if the hashed 
            value of the sequence hits the hashtable value).
*Dig_Sits[] of size (# of REs) that points to the array of #s indicating where in the 
            sequence an RE cut (keeps the data that allows building RE maps and fragments); it's organized so that
            the 0th el points to the next free element so 'real data' starts at Dig_Sits[x][1].
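*Frag_Siz[] array that keeps track of the fragments generated by the cutting - could be done on the fly 
            for the current implementation, but will be needed soon, so do it right the 1st time (sister of 
            *Dig_Sits[]; 0th element is used as a pointer to the end).
            NOTE(review): the fragment-size code in this file writes a fragment length into element 0 - 
            confirm which description is current.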
*Dig_Dat    the array (init'ed to 10,000 elements (enough for a sequence of about 50,000) 
            and realloc'ed larger if needed) that keeps the data as it is generated for later 
            sorting and use in *Dig_Sits[] (see above).  Its format is:
            base cut site|RE#|base cut site|RE#|base cut site|RE#|base cut site|RE#| (no -1 demarcations)
*Codons[][] the array that holds the codon preference data - list of labels indexed by hash of the corresponding
            codon triplet.
*Codon_Labels[]  Array that holds the organism labels of the different codon preference tables     
D_D_Cnt     counter for dig_dat[]
D_D_Siz     Size of Dig_Dat[]
Cur_RE      'Current RE Number' - simplifying var for "chk_sits[dgn_sits[0]][m]" in printing bit
Base_cutsite   the base site at which the RE cuts (not considering the offset (RE[].E_tcut), which will 
            vary for each enzyme)
O_txt[][]   char array to hold each 'block' (one line of sequence w/ associated REs, #s, translations, etc) 
            as it's being prepped for output
ok2wr[]     int array to hold the position markers that indicate where in O_txt it's 'OK to write' an RE name
            so that it doesn't overwrite a previous one
okline      index to ok2wr that starts at (O_SEQ_LINE-2) and decrements to keep track of where the next 
            line and position is that's ok to write to.  
block_Cnt   keeps track of how many blocks have been output to calculate where we are in the sequence
base_real_pos  real world coordinates of where we are in the sequence
block_repeat   do the 'compose block, print block' routine this many times
block_cut_pos  base_real_pos modified by the offset of where the RE actually cuts
SHKey       Sequence Hash Key, the hash key generated from the previous sequence hexamer by the 
            'shortcut' method, mentioned below
lead        the leading value in the shortcut method 'lead a g t g t'
lag         the lagging value in the shortcut method  'a g t g t lag'
eoseq       pointer to the end of the alloc for the sequence
*GREs       pointer to array for the indices to the 'Good REs' vs the doppelgangers (2nd half of the nonpals)
NumGREs     the number of Good REs (size of GREs above)
Gi          = GREs[i] for clarity - "Good RE Index"
topo        topology - 0 if the DNA is circular, 1 if linear - makes sense, no?
BOS         Beginning Of Sequence - marker for BOS changes depending on topo
EOS         End Of Sequence -             "
progname    (variable name was missing here; presumably progname[] - confirm) The identifier string for 
            the program that gets passed to the udp function sayiamhere()  
*RebaseFile    holds the name of the alternative REBASE file name, if specified on the command line
*/ 


/* Have to take a careful look at what "int" and "long" actually are: if int is really 4 bytes, then 'int' 
ought to be long enough for most things, since it covers about +/- 2 billion */
/* Declarations */
   int i, j, k, mm, m, dpl, dgn_sits[256], *chk_sits[4096], lead, lag, SHKey, n_letters=0,
      hashtable[4096], seq_hashtable [4096], NumREs, okline, codon_Table=0, OLap,
      min_okline, tic_line, Degen, Cur_RE, block_Cnt, block_repeat, gel=0,
      max_okline, block_cut_pos, ok2wr[MAX_OUTPUT_LINES], itemp1=0, itemp2=0, rsl,
      *GREs, NumGREs, Gi, p_width, topo, abcp,  basesPerLine, reps, in_OL=-1, seq_offset;

   long  D_D_Siz = 10000, l, li, lj, *Dig_Dat, *Dig_Sits[MAX_NUM_RES+10], *Frag_Siz[MAX_NUM_RES+10], D_D_Cnt, 
         D_D_Cnt_reset=0, Base_cutsite, BOS, EOS, seq_len,  tot_seq_Cnt = 1,  Xn, base_real_pos; 

   float fRE_mag;

   char ctemp1[30], RE_rawsite[32], *sequence, s[256], RE_hex[6], ct, O_txt[MAX_OUTPUT_LINES][O_LMARG+O_RMARG+MAX_BASES_PER_LINE], 
        Prot_out[MAX_BASES_PER_LINE], Codons[MAX_ORGS][N_CODONS][4], Codon_Labels[MAX_ORGS][20], progname[200], *RebaseFile, 
        LadFile[100], GelFile[100];
   extern char *optarg;
   extern int optind;

   struct RE_struct RE[MAX_NUM_RES+10];
   /*                          0   1   2   3       4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22 
char flag_letter[NFLAGS]  =  {'f','n','o','m',    'M','b','e','g','l','t','T','O','v','h','?','c','r','R','C','F','L','s','w'}; */
long flag_value[2][NFLAGS] ={{ 1,  4,  1,  1,  32000,  1,  0,  0,  0,  1,  0, -1, -1, -1, -1,  1, -2, -2,  0, -1,  0,  0, 60},
                             { 1,  4,  1,  1,  32000,  1,  0,  0,  0,  1,  0, -1, -1, -1, -1,  1, -2, -2,  0, -1,  0,  0, 60}};

/* Initialize whatever vars are needed */
   NumREs=10;  /* start at 10 for admin purposes */

/* Get mem for Dig_Dat */
   Dig_Dat = (long *) calloc (D_D_Siz, sizeof(long)); /* init the pointer to 10,000 */
   if (Dig_Dat== NULL)  {
        fprintf(stderr, "Boom!! calloc failed on initial call to get space for the Dig_Dat mem!\n");
      exit (1);
   }

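/* Zero hashtable and seq_hashtable.  NOTE(review): the count passed to memset below is 4096 bytes, not 
   4096 * sizeof(int), so only the first 4096 bytes of each int array are cleared - confirm and fix */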
   memset(hashtable, 0 ,4096);  
   memset(seq_hashtable, 0 ,4096);  

/* Figure out what the program should do, either by parsing the commandline options or by asking the user */   
/* if the program is invoked by name alone, tell the user how to use it, but still allow it to do useful work */
   if (argc < 2 ) fprintf(stderr, "type 'tacg -h' for more help on program use. \n"); 

/* Now get and process the flags and return the alt REBASE file, if any */
   RebaseFile = SetFlags (argc, argv, flag_value /*, LadFile, GelFile */);
   if (flag_value[0][17] != 1)  RebaseFile = SearchPaths("rebase.data", "REBASE");

/* Open and process the "codon.prefs" file via function Read_Codon_Prefs ()  */
    Read_Codon_Prefs (Codons, Codon_Labels);

/* Get the sequence from stdin, format it into 'sequence', bracket it with overlaps to allow 
   circular cutting and return its actual length */
   sequence = GetSequence(&tot_seq_Cnt, flag_value, &seq_len);

/* set up more vars, based on flags - could use only array to pass vars, but this is more readable */
   topo = flag_value[0][0]; 
   basesPerLine = flag_value[0][22];
   p_width = O_LMARG + O_RMARG + basesPerLine;
   memset(O_txt,' ',((O_LMARG+O_RMARG+MAX_BASES_PER_LINE)*MAX_OUTPUT_LINES));  /* set all of O_txt to blanks - works!*/
   if (flag_value[0][7] != 0) gel = 1;
/* Decide from flag_value how to handle translation options ... */
   if (flag_value[0][9] != 0)  {Xn = flag_value[0][9]; n_letters = 1; } 
   else {Xn = flag_value[0][10]; n_letters = 3; }

   max_okline =  O_SEQ_LINE + 4 + abs(Xn); /* how many more lines of output are required for the translation */
   codon_Table = flag_value[0][18]; /* which Codon Table to use in Translate() */

/*********************************************************************************************************************
*   Start of RE input and processing - functionized for Ver 2																		   *
**********************************************************************************************************************/

/* open the REbase input file, specified either by flag or by setting above */
/*   fprintf(stderr,"Going to try to read \"%s\"(REbase input file)!!\n", RebaseFile);  */
   if ((fpinRE=fopen(RebaseFile,"r")) == NULL) {  /* if it's not there and readable */
      fprintf(stderr,"Cannot open the REbase file \"%s\" for reading!!\n", RebaseFile); 
      exit(1); /* print an error and die gracefully */
   }

/*  Scan thru comments in rebase file to separator ("..")   */
   while ((feof(fpinRE) == 0) && (strncmp(ctemp1, "..",2)!=0))  { 
      fscanf (fpinRE, "%2s", ctemp1); ct = 'm'; 
      while (ct != '\n') ct = fgetc(fpinRE); /* and read to end of line */
   }

/* we're at "..", so go into file and start *directly* slurping data into struct
   except for RE_rawsite which has to be filtered a couple ways before being acceptable */
   while (((fscanf(fpinRE,"%s %d %s %d",ctemp1,&itemp1,RE_rawsite,&itemp2)) != EOF) && (NumREs<MAX_NUM_RES)) { 
      /* remember that scanf arguments have to be passed by address (&var) unless they are already pointers or arrays */
      ct = 'm'; while (ct != '\n') ct = fgetc(fpinRE); /* and read to end of line */      
      /* fprintf(stderr, "Candidate RE raw sites = %d %s\n", NumREs, RE_rawsite); */

      if (ctemp1[0] != ';'){  /* if the enz hasn't been commented out */
         /* The following bit filters the entire recognition site from rebase.data to the REstruct.  Difference 
         from below is that it does it for the whole sequence, not just the 1st 6 bases for the hashtable */
         /* grab the raw site for labelling purposes */
         /* fprintf(stderr, "Verified NumREs = %d %s\n", NumREs, ctemp1);  */
                        rsl = strlen(RE_rawsite)+1; /* rsl = Raw Site Length - used because of a perceived bug in GNU strcpy() */
         RE[NumREs].E_raw_sit = (char *) calloc(rsl, sizeof(char)); /* 1st grab some space */
         if (RE[NumREs].E_raw_sit == NULL) {
            fprintf(stderr, "Boom!! calloc failed at getting RE[NumREs].E_raw_sit mem!\n");
            exit (1);
         }
         /* fprintf(stderr, "struct=%s, src=%s\n",*/
         strcpy (RE[NumREs].E_raw_sit, RE_rawsite); /* , RE_rawsite); */   /* then copy it to RE_rawsite */
                        /* fprintf(stderr, "Missing? %s = %s \n",RE[EMPTY].E_nam, RE[EMPTY].E_raw_sit); */
         /* Stanza to calc overhangs for matching to filter-by-overhang */         
         OLap = 1;	  /* reset the default to take all the REs, regardless of overhang */
         /* calc overlap value for exclusion clause below */
         if (flag_value[0][2] != 1) {    /* if we want to restrict selection by overhang */
                if (itemp2 > 0) OLap = 5;			   /* have to calc it */
            else {
                if (itemp2 < 0) OLap = 3;
                else OLap = 0;
            }
        }

        i=j=0;
         while (RE_rawsite[i] != '\000') {  /* while not at end of string  */
            if (RE_rawsite[i] != '_' && RE_rawsite[i] != '\'') { /*As long as it's a valid char */
               RE_rawsite[j++] = tolower(RE_rawsite[i++]); /* The struct value gets the whole site */
            } 
            else i++;  /* incr i only, skipping the _ and ' chars  */
         } 

         /* Here's where the 'nnn' detection/handling goes */
         while(RE_rawsite[--j]=='n');  /* back up until hit a non 'n' */
         RE_rawsite[++j] = '\000';     /* then restore the end of string mark */
          /* Now calculate E_mag exactly to see if it should be included in the digestion */
         fRE_mag = 0; /* float counter for magnitude */
         for (i=0;i<j;i++){ /*for each base in the site ...*/
            switch (RE_rawsite[i])  { 
               default: fprintf (stderr, "Strange character in E_mag calc'n: %c\n", RE_rawsite[i]); break;
               case 'a': case 'c': case 'g': case 't': fRE_mag += 1; break;
               case 'y': case 'r': case 'm': case 'k': case 'w': case 's': fRE_mag += 0.5; break;
               case 'b': case 'd': case 'h': case 'v': fRE_mag += 0.25; break;
               case 'n':  break; /* 'n' doesn't cause the mag to be increased at all */
            }
         }

         if ((OLap == flag_value [0][2]) && ((int) fRE_mag >= flag_value[0][1])) { /* don't bother loading any REs that don't match this crit */
                RE[NumREs].E_mag = (int) fRE_mag; /* this truncates fRE_mag, but since it's upwardly inclusive ... */
            strncpy(RE[NumREs].E_wsit, RE_rawsite, j); /* E_wsit = the recog site, stripped of _ and ' and extra n's */
            RE[NumREs].E_len = j;  /* set the real recog length since we calculated it */
            RE[NumREs].E_pal = palindrome (RE[NumREs].E_wsit, j); /* and determine if the whole site is a pal */
            /* Filter the raw site from rebase.data to the hexamer val in struct
               that's submitted for the numeric conversion */

            i=j=0;
            while (j<6) {  
               if (RE_rawsite[i] == '\000') { /* if the site is shorter than a hexamer */
                   while (j<6) RE_hex[j++] = 'n'; /* pad out the site with n's */
               } else RE_hex[j++] = RE_rawsite[i++]; /* else copy one to the other */
            } 
            /* And assign the rest of the temps to the struct vars */

            for (i=0;i<6;i++) RE[NumREs].E_hex[i] = RE_hex[i];
            RE[NumREs].E_nam_l = strlen(ctemp1); /* get the length of the RE name and pop it in */
            strcpy (RE[NumREs].E_nam,ctemp1);    /* and pop the RE name in too */

            RE[NumREs].E_tcut = itemp1;
            RE[NumREs].E_olap = itemp2; /*But DON'T incr the NumREs counter yet! */

         /* Now do the ugly handling of nonpalindromes - this handles non-pal's as 2 enzymes with 
            the same names - it creates another entry in RE[] and copies over all identical data.  */
            if(RE[NumREs].E_pal == 0)  {  /* if the RE is not a pal  */
               dpl = NumREs+1;      /* incr the pointer to the next struct entry    */
               /* reverse complement the structure */
               Anti_Par(RE[NumREs].E_wsit, RE[dpl].E_wsit, RE[NumREs].E_len);
               for (j=0; j<RE[NumREs].E_len; j++) RE[dpl].E_hex[j] = RE[dpl].E_wsit[j];
               if (RE[NumREs].E_len<6) for (j=RE[NumREs].E_len;j<6;j++) RE[dpl].E_hex[j] = 'n';

               /* and copy over the rest of the values   */
               strcpy (RE[dpl].E_nam, RE[NumREs].E_nam);

                                        /*rsl = strlen(RE[NumREs].E_raw_sit); */
               RE[dpl].E_raw_sit = (char *) calloc(rsl, sizeof(char));
               if (RE[dpl].E_raw_sit == NULL) {   /* why does malloc work and calloc not work? */
                  fprintf(stderr, "Boom!! malloc failed at getting RE[dpls].E_raw_sit mem!\n");
                  exit (1);
               }
               strncpy (RE[dpl].E_raw_sit, RE[NumREs].E_raw_sit,rsl); /* used because of perceived bug in Linux/GNU strcpy() */

               RE[dpl].E_len  =  RE[NumREs].E_len;
               RE[dpl].E_pal     =  -1; /* so it's id'ed as the doppel and can point back to 1st */
               RE[dpl].E_dgen    =  RE[NumREs].E_dgen;
               RE[dpl].E_mag     =  RE[NumREs].E_mag; 
               RE[dpl].E_nam_l   =  RE[NumREs].E_nam_l;
               /* topcut requires a little jig */
               RE[dpl].E_tcut    =  RE[NumREs].E_len - RE[NumREs].E_tcut;
               RE[dpl].E_olap    =  RE[NumREs].E_olap;
               NumREs++;   /* bring NumREs up to dpl  */
           }  /* end nonpalindrome handling */ 
           NumREs++;   /* and incr it to point it to the next struct array entry */
         }  /* end of magnitude check */
      } /* end of if statement that checks for ';' at beginning of RE name */
   } /* End of while loop that reads in  RE enzyme data into struct */

/* Now need to cycle thru the struct, generating the hashtable and chk_sits arrays */
   for (m=10;m<NumREs;m++) {
      /* Don't forget to reset the variables that need it! */
      RE[m].E_dgen = hash(RE[m].E_hex, dgn_sits, 6); /* call to hash !!! */
      /* And finally increment the hashtable to reflect the degeneracy  */
      for (i=0; i<RE[m].E_dgen; i++) {
         hashtable[dgn_sits[i]]++;  /* incr the corresponding element of the array */
      }  
   }   

/* Need to calloc (alloc and zero) the space that chk_sits will use */
   for (i=0;i<4096;i++){
      if (hashtable[i]>0){
         chk_sits [i] = (int *) calloc ((hashtable[i]+1), sizeof(int)); 
         if (chk_sits [i] == NULL) {   
            fprintf (stderr, "calloc failed on getting mem for chk_sits!\n");
            exit (1);
         }
         chk_sits[i][0] = 1; /*   the [0] is used as a pointer to the next usable index */
      }                      /* so it has to be incr to point to 1 */
   } 

/* And finally rehash the enz list, filling out the chk_sits array with the index of the RE struct 
   that has to be checked in the event that the sequence hashes to the chk_sits index - ugly to do 
   something twice like this, but I can't think of a way around it yet*/

/* TBD: Well, now I can - assign a large amount of memory for the chk_sits arrays initially, realloc more mem 
   if they run out, then run thru them afterwards, realloc'ing the mem to the right size.  Question is, 
   is it going to make the code any faster or more efficient?  Low priority now... */

   for (i=10;i<NumREs;i++){
      Degen = hash(RE[i].E_hex, dgn_sits, 6);
      /* and now use dgn_sits to to update chk_sits */
      /* below means that at each step, the counter at chk_sits [][0] is incremented and
         is used to point to the position of the next entry (initialized to 1, above).  it's a
         bit arcane in that the indices are self-referential  */
      for (j=0;j<Degen;j++) chk_sits[dgn_sits[j]]  [chk_sits[dgn_sits[j]] [0]++] = i; 
   }

/************************************************
*   End of RE input and processing section      *
************************************************/
        /* for (i=10;i<NumREs;i++) fprintf(stderr, "Raw Site #%d (%s) = %s\n",i, RE[i].E_nam, RE[i].E_raw_sit); */


/* Cutting the Sequence, using SHKey/shortcut approach..... The shortcut is to hash the sequence hexamer only 
once and then use the calculated values of the hexamer to assist in calculating the next hash key.  Since 5 of 
the 6 characters have already been 'hashed' for the next key, why recalculate them and suffer the overhead of 
a function call, when you can (with just a little extra code) get the same value on the fly, inline....
It appears to be about 5-7x faster for long sequences (where the cost is in the rehashing, not the base
cost of hashing the RE data twice) */ 

/* Below sets up the margins for running thru the seq - if it's circular (topo=0), have to include the repeated
   seqs on the beginning and end.  If topo=1, then just have to run thru the real seq (minus the repeats at the 
   beginning and end). In both cases, have to go thru Dig_Dat and 'normalize' the position info to the actual 
   sequence coordinates by subtracting BASE_OVERLAP from the index as written - altho this could be done on the 
   fly without too much effort */

/* Because of the way SHKey is calculated, sequence indices should start on a '1' boundary, rather than a '0' boundary
   so that a 1 can be subtracted without catastrophe  - also makes it easier to understand where the sequence actually is */

   if (topo == 0)  {          /* if the sequence is circular */
      BOS = 1;                /* Beginning Of Sequence starts at the very beginning of 'sequence' */
      EOS = tot_seq_Cnt;      /* and End Of Sequence ends at very end  */
   } else {                   /* but if it's linear */
      BOS = BASE_OVERLAP + 1; /* BOS starts after the beginning buffer */
      EOS = tot_seq_Cnt - BASE_OVERLAP;   /* and EOS is before the ending buffer */
   }

   D_D_Cnt = 0;  
   /* RE_hex[0] = 'a'; */  /* 1st base of seed hexamer can be anything - it's discarded  */
   for (i=0; i<6; i++) RE_hex[i] = sequence[BOS-1+i]; /* then 5 more of the real seq, since it's discarded anyway */
   Degen = hash(RE_hex, dgn_sits, 6);  /* calculate the initial hash value */
   SHKey = dgn_sits[0]; /* SHKey = 'Sequence Hash Key'   */

   for (li=BOS; li<EOS-4; li++) {   /* do the whole seq incr 1 nuc at a time */
      lj = li+5;  /* lj points to the 'lag'ging base */
      /* calculate 'lead' value */
      switch (sequence[li-1]) {     /* a=0, c=1, g=2, t=3,  */
         case 'a': lead = 0;  break;   
         case 'c': lead = 1;  break;   /* lead values are numeric value * 1 */
         case 'g': lead = 2;  break;
         case 't': lead = 3;  break;
         default:  break;  /* bad character detection - but it's already been done */
      }  
      /* calculate 'lag' value */
      switch (sequence[lj]) {    /* a=0, c=1, g=2, t=3,  */
         case 'a': lag = 0;  break;   /* lag values are numeric value * 1024 */
         case 'c': lag = 1024;  break;   
         case 'g': lag = 2048;  break;
         case 't': lag = 3072;  break;
         default:  break;  /* bad character detection */
      }  
      /* instead of calling 'hash' on each new hexamer, can calc it incrementally as below */
      SHKey = ((SHKey-lead) >> 2) + lag;  /* bit shift version; shouldn't need the int cast - will it work?  yes */

      /* now check the generated key against the hashtable to see if there's an entry */
      if (hashtable [SHKey] != 0) { /* no degeneracy so only dgn_sits[0] (= SHKey) will matter */
          for (m=1; m<chk_sits [SHKey][0]; m++) {
             Cur_RE = chk_sits[SHKey][m]; /* For clarity's sake!! */
            /* Also writes REAL WORLD coords to Dig_Dat, so Frag_Siz will be easier, but pretty printing has to change */
            /* Large Majority of recog sites are <= 6 base pairs so the last part of the following test should be 
               executed only very rarely */
            if (( RE[Cur_RE].E_len < 7) ||  (li + RE[Cur_RE].E_len <= EOS &&
                  Degen_Cmp (RE[Cur_RE].E_wsit, (sequence+li),RE[Cur_RE].E_len) == 1 )) {
            /* mm = real world coordinates; has to be calculated differently for circ and linear */
               mm = li-BASE_OVERLAP+RE[Cur_RE].E_tcut;  /* temp var to save adds below */
               if (mm > 0 && mm < seq_len)  {   /* if it's within the sequence */
                  if (RE[Cur_RE].E_pal != -1){  /* if the Cur_RE  is a pal or the 1st half of a nonpal */
                     RE[Cur_RE].E_Ncuts++;      /* handle normally */
                     Dig_Dat[D_D_Cnt++] = mm;   /* now should be set to real world coordinates */
                     Dig_Dat[D_D_Cnt++] = Cur_RE;
                  } else {    /* but if it's the 2nd half of a nonpal */
                     RE[Cur_RE-1].E_Ncuts++;    /* give all credit to its 1st half (Cur_RE-1) */
                     Dig_Dat[D_D_Cnt++] = mm;   /* now should be set to real world coordinates */
                     Dig_Dat[D_D_Cnt++] = (Cur_RE-1) * -1;   /* ditto but mark it by negation */
                  }
               }
            }
         }
         if (D_D_Siz - D_D_Cnt <25) {  /* Now check to see if we need more more memory */
            D_D_Siz = D_D_Siz+5000;
            Dig_Dat = (long *) realloc (Dig_Dat, sizeof(long)*D_D_Siz); /* realloc the size 5000 bigger */
            if (Dig_Dat== NULL) {
               fprintf(stderr, "Boom!! realloc failed on call to get space " 
               "for the Dig_Dat mem at D_D_Siz = %ld!\n", D_D_Siz);
               exit (1);
            }
         }
      }  /* end of "if (hashtable [SHKey] != 0)    "... */
   }  /* end of "for (i=0; i<tot_seq_Cnt-6; i++) "... (end of the actual cutting routine) */

   Dig_Dat[D_D_Cnt] = -2; /* mark the end of Dig_Dat so we know where to end later on */
/*  fprintf(stderr, "size of D_D_Cnt = %d\n", D_D_Cnt); */ /* and tell us how big it is, just because */

   /* Now, filter the pointers to the good REs (GREs[]) from the doppelgangers */
   GREs = (int *) calloc (NumREs, sizeof (int));
   if ( GREs== NULL) { /* error checking */
      fprintf(stderr, "Boom!! calloc failed on call to get space for the GREs mem!\n");
      exit (1);
   }
   for (i=10,NumGREs=0;i<NumREs;i++) {  /* go thru RE, transferring the index of all the good (non doppel) REs */
      if (RE[i].E_pal != -1) GREs[NumGREs++] = i; /* if it's not a doppel, note it in GREs */
   }

/* translate the data from Dig_Dat to Dig_Sits for calc'ing fragment sizes */
   /* This procedure skips doppels completely by using the GREs[] as a pointer to what should be counted
   in the assignment - uses an additional level of indirection but skips all kinds of messy housekeeping
   later in the program - will generally use a shorter var 'Gi' to make the 
   line shorter */ 
   /* calloc the space for *Dig_Sits[] and sister array *Frag_Siz[] and init the 0th el to point to 1st free spot */
      for (i=0;i<NumGREs;i++) { 
         Gi = GREs[i]; /* for clarity - "Good RE Index" */
         Dig_Sits[Gi]=(long *) calloc (RE[Gi].E_Ncuts+2, sizeof(long));
         Frag_Siz[Gi]=(long *) calloc (RE[Gi].E_Ncuts+2, sizeof(long)); /* need '+2' for extra frag */
         if (Dig_Sits[Gi]==NULL || Frag_Siz[Gi]==NULL)  {     /* and make sure we got the space */
            fprintf(stderr, "Boom!! calloc failed on call to get space \
            for the *Dig_Sits[] or *Frag_Siz[] mem at iteration = %d!\n", i);
            exit (1);
         } 
         Dig_Sits[Gi][0] = 1;      /* the [i][0] el points to the next free element, so init here */
      }   

/* The following loads Dig_Sits[], the array that tracks the digestion sites - *should not* have to be
   modified for *doppelgangers* (2nd half of nonpals) as Dig_Dat should only have refs to the 1st halves */
   i=0;
   while (Dig_Dat[i] != -2) { /* -2 is the end point indicator */
      Base_cutsite = Dig_Dat[i++]; /* i now points to RE# */
      mm = abs(Dig_Dat[i++]); /* shortens next line, incr's i to point to next position */ 
      Dig_Sits[mm] [Dig_Sits[mm] [0]++] = Base_cutsite;    /* weird, self-referential */
   }    /*   [RE#] [Pointer to next free el++ ]     */     /* definition, but.. it works.. */
   /* Dig_Sits now loaded with REAL WORLD Coords so calc'n of Frag_Siz should be accurate */

/* Now calc fragment sizes, sort in increasing size, now using REAL WORLD Coords so frags should be 
   exactly right size - also have to consider topology for 1st and last cut*/
   for (i=0;i<NumGREs;i++) {  /* the lower index point starts @ '0', not '10' because we're using GREs[] */
      Gi = GREs[i];
      if (RE[Gi].E_Ncuts != 0) { /* using 'l' as index -> Ncuts+1 frags in linear, counting the 0th el*/
         l = RE[Gi].E_Ncuts;  /* need a modifiable variable;  */
         if (topo == 1) {  /* handling topo diffs - if it's linear... */
            Frag_Siz[Gi][0] = Dig_Sits[Gi][1]; /* 1st frag will be magnitude of 1st cut */
            Frag_Siz[Gi][l] = seq_len - Dig_Sits[Gi][l]; /* last frag will be length of sequence - last cut */
         } else { /* otherwise it's circular and the 1st frag is the size of the above 2 combined ...*/
            Frag_Siz[Gi][0] =  Dig_Sits[Gi][1] + seq_len - Dig_Sits[Gi][l];
            Frag_Siz[Gi][l] = 0; /* and the 'last' doesn't exist (size 0) */
         }
         if (RE[Gi].E_Ncuts > 1) {
            for (j=1;j<RE[Gi].E_Ncuts;j++)   {
               Frag_Siz[Gi][j] = abs(Dig_Sits[Gi][l] - Dig_Sits[Gi][l-1]); /* frag = site nearer end - next site  */
               l--;  /* need the 'abs' because using REAL WORLD coords, it's possible to get negative values by */
            }        /* same offset cutting RE site on diff strands in close proximity */
         }
      } else Frag_Siz[Gi][0] = seq_len;  /* if it doesn't cut, the fragment is the size of sequence */
   }

/***************************************
*      Now, starting the OUTPUT phase  *
***************************************/

/* Output a table of REs that do not cut... */
   reps = (int) ((O_LMARG+O_RMARG+basesPerLine)/10 -1);
   j=0; Cur_RE = -1;
   printf ("\n\n\n Restriction Enzymes that DO NOT CUT in this sequence:\n\n");
   while (Cur_RE < NumGREs) {
      if (RE[GREs[++Cur_RE]].E_Ncuts == 0) { printf ("%10s", RE[GREs[Cur_RE]].E_nam); j++; }
      if (j == reps) { printf ("\n"); j=0; }
   }

/* Now, just output a table of the number of cuts by enzyme - sort of redundant, but useful */
   reps = (int) ((O_LMARG+O_RMARG+basesPerLine)/15);  m = (int) (NumGREs/reps)+1;    
   printf ("\n\n\n Total Number of Cuts per Restriction Enzyme:\n\n");
   for (i=0;i<m;i++) {
      for (j=0;j<5,i+(j*m)<NumGREs;j++) printf ("%10s%5d", RE[GREs[i+(j*m)]].E_nam, RE[GREs[i+(j*m)]].E_Ncuts);
      printf ("\n");
   }

   reps = (int) ((O_LMARG+O_RMARG+basesPerLine)/7); /* needed for both PrintSites() and PrintFrags() */

/* And if wanted (-s = 1), print out all the cut sites, filtered as below in PrintFrags() */
   if (flag_value[0][21] == 1) PrintSitesFrags(RE, NumGREs, reps, GREs, Dig_Sits, flag_value, 1);

/* Now if wanted, print out the UNsorted fragments generated by the cuts, listed across the page. 
This routine also uses a very large number of printf calls to generate the output format and 
might be better written to 'compose and dump' entire lines rather than doing it piecewise 
as it does here.  'Compose and dump' is the way I'm trying to write the long text output
but it's taking time as I've never done this thing before - advice is welcome. */

/* Might also sort by number of Fragments as Strider does, but not yet. */
   mm = (int)flag_value[0][19];
   if (mm==1 || mm==3) { /* if want *unsorted* or *both sorted and unsorted * frags */
        printf("\n\n\n  **  UNSORTED "); /* prefix to PrintFrag() title */
        PrintSitesFrags(RE, NumGREs, reps, GREs, Frag_Siz, flag_value, 0);
   }
   if (mm==2 || mm==3 || gel) { /* if want *sorted* or if need sorted for gel */
      for (i=0;i<NumREs;i++) {  /* the lower index point starts @ '0', not '10' because we're using GREs[] */
         Gi = GREs[i]; /* for clarity */
         if (topo == 0) j = RE[Gi].E_Ncuts; /* j is used to pass the number of els to qsort */
         else j = RE[Gi].E_Ncuts+1; /* if topo == 1, then there's an extra fragment */
         qsort(Frag_Siz[Gi], j, sizeof(long), compare); /* 2X as fast as braindead sort!) */
      } /* Now they're sorted! Just use PrintFrags again to dump them out */

      if (gel) PrintGelLadderMap(RE, NumGREs, seq_len, GREs, Frag_Siz, flag_value, gel); 
      if (mm==2 || mm==3){
         printf("\n\n\n  **  SORTED "); /* prefix to PrintFrag() title */

        PrintSitesFrags(RE, NumGREs, reps, GREs, Frag_Siz, flag_value, 0);
      }
   }

   if (flag_value[0][8] == 1) {  /* if you want a text-ladder map of the sequence */
      PrintGelLadderMap(RE, NumGREs, seq_len, GREs, Dig_Sits, flag_value, 0);  
   }

/* if you want a linear map of the sequence */
   if (flag_value[0][20] == 1) {  
      /* Routine to print out the text of the digestion, with names going where they should go
      a la Strider - have to keep track of how long the names are and where an allowable place 
      to write the name - */

      /* In brief, the following procedure (which should probably be turned into a function), does this: 
      It reads thru Dig_Dat (the array that keeps track of the linear sequence of cuts in the sequence) 
      and matches the position of the enzymes to the current block being printed, taking into account 
      the cut offset of the RE from the 1st base of the recognition sequence.  The tricky part is that 
      because of this latter functionality, it has to check the sequence that will be the *following* 
      block to see if there are any RE recognition seqs within the 1st few bases that will cause the actual 
      cut site to be in the *previous* block.  Doing this seems to take a great amount of the cpu time - it can
      probably be optimized a good deal more, but it's still >10X faster than GCG ;). */

      fprintf(stdout, "\n\n\nRestriction Map of Sequence \n\n"); /* Write the header */ 
      /* inits, etc */ 
      if (seq_len % basesPerLine == 0)  block_repeat = (int) seq_len/basesPerLine; /*  # times thru the compose/print cycle */
      else  block_repeat = (int) seq_len/basesPerLine + 1;  
      D_D_Cnt=0;  /* start the Dig_Dat counter at the beginning */
      base_real_pos = Dig_Dat[D_D_Cnt];

      for (block_Cnt=1; block_Cnt <= block_repeat; block_Cnt++) { 
      /*  fprintf(stdout, "block_Cnt = %d, block_repeat = %d\n", block_Cnt, block_repeat);   */ 
         /* if there was an OVERLAP failure previously, fix it */ 
         if (in_OL == 1) D_D_Cnt = D_D_Cnt_reset;  /* and reset D_D_Cnt to that pointer */ 
         in_OL = -1;  /* and reset 'in OVERLAP' marker for new block */
         base_real_pos = Dig_Dat[D_D_Cnt];  /* Make sure that ' base_real_pos' is set */
         okline = min_okline = O_SEQ_LINE-2; /* set the min (highest on page) line of buffer to print out */

         /* Set up the seq and its rev compl in the output buffer */
         memset (ok2wr,0,sizeof (int)*MAX_OUTPUT_LINES);  /* set/reset all of ok2wr to 0's for the new block*/
         /*write the correct numbering in Left margin  */
         sprintf(s, "%7d",(((block_Cnt-1)*basesPerLine) + 1)); /* these 2 lines could be combined...? */
         memcpy (&O_txt[O_SEQ_LINE][0], &s, 7);
         /* and in the right margin - combine with the above bit? */
         sprintf(s, "%7d",(((block_Cnt)*basesPerLine)));
         memcpy (&O_txt[O_SEQ_LINE][O_LMARG+basesPerLine], &s, 7);
         /* and drop in the sequence directly after it. */

         if (block_Cnt != block_repeat) k = basesPerLine; /* test for end of sequence to end gracefully */ 
         else k = seq_len -1 - (block_Cnt-1)*basesPerLine;
                                 /* fprintf(stderr, "flag_value[0][5] = %ld flag_value[0][6] = %ld \n", flag_value[0][5], flag_value[0][6]); */
         if (flag_value[0][5] != 1) { /* if the sequence is a subsequence, print the #s of the original seq too */
                 /*write the correct numbering in Left margin  */
                 sprintf(s, "%7d",(((block_Cnt-1)*basesPerLine)+flag_value[0][5])); /* these 2 lines could be combined...? */
                 memcpy (&O_txt[O_SEQ_LINE+1][0], &s, 7);
                 /* and in the right margin - combine with the above bit? */
                 sprintf(s, "%7d",(((block_Cnt)*basesPerLine - 1 + flag_value[0][5])));
                 memcpy (&O_txt[O_SEQ_LINE+1][O_LMARG+basesPerLine], &s, 7);
         }

         memcpy (&O_txt[O_SEQ_LINE][O_LMARG], &sequence[(block_Cnt-1)*basesPerLine+BASE_OVERLAP+1], k); 
         /* and need to use the "s" intermediate here because Rev_Compl doesn't return the pointer to 
         the converted string, although it probably should... */
         Rev_Compl(sequence+BASE_OVERLAP+1+((block_Cnt-1) * basesPerLine), s, k);
         /* and then plunk the converted sequence into O_txt with a similar statement */
         memcpy (&O_txt[O_SEQ_LINE+1][10], &s, k /*basesPerLine*/ );

         /* write out the minor and major tics below the reverse complement */
         tic_line = O_SEQ_LINE+2;
         for (i=0;i<(int)(basesPerLine/10);i++) memcpy (&O_txt[tic_line][i*10+O_LMARG],"    ^    *",10);

         /* Following call to Translate has to calculate the # of bases at each call; otherwise end up with 
            overrun condition at end; "k" in code above */
         if (n_letters != 0) {
            for (mm=0;(mm<Xn && mm<3); mm++) {
               if (k % 3 != 0) k = ((int) k /3)*3;   /* also added this code to Translate() for better modularity */
               seq_offset = ((block_Cnt-1)*basesPerLine+BASE_OVERLAP+1);
               Translate (sequence+seq_offset+mm, Prot_out, k, n_letters, Codons, codon_Table); 
               memcpy(&O_txt[O_SEQ_LINE+3+mm][O_LMARG+mm], &Prot_out, k); /* and copy it into place */
               if (Xn == 6) { /* but if it's 6 frames of Xlation, need to do oppo at the same time */
                  Anti_Par (sequence+seq_offset-mm, s, k); /* get the *antipar* sequence */
                  Translate (s, Prot_out, k, n_letters, Codons, codon_Table); /* Xlate it into N-letter code */
                  if (n_letters == 1) Reverse (Prot_out); /* to match the 'backwards' DNA strand */
                  else Triplet_Reverse (Prot_out);
                  memcpy(&O_txt[O_SEQ_LINE+6+mm][O_LMARG-mm], &Prot_out, k); /* and copy it into place */
               }
            }
         }

         /* This 'while' loop tests each text block for REs and also prints out those blocks that have no
         cuts in them at all but which have to be printed anyway. */
         while ((Dig_Dat[D_D_Cnt] != -2) && (base_real_pos < ((block_Cnt * basesPerLine) + BASE_OVERLAP))) {
            D_D_Cnt++; 
            /* if we're not at the end and the real pos'n <  current block + the overlap.. */
            /* !! Now Dig_Dat[D_D_Cnt] should point to THE corresponding  RE #s !! */
            okline = O_SEQ_LINE-2;
            /* Check if the RE is a pal (# >0) or not (# <0) */
            if (Dig_Dat[D_D_Cnt] < -9) {   /* If the entry < -9, it's the 1st part of a non-palindrome */
               Cur_RE = abs(Dig_Dat[D_D_Cnt])+1;  /* so to get the right coords for it, 
               the matching doppelganger entry has to be referenced (add 1 to the abs value) */
            }      /* also re-using 'Cur_RE' below (in context) - hope it doesn't break anything... */
            else  Cur_RE = Dig_Dat[D_D_Cnt];
                /*dump some debugging info...*/
         /*   fprintf (stdout, "D_D_Cnt=%d; %s (%d) = %s Pos'n/Enz=%d/%d\n", D_D_Cnt, RE[Cur_RE].E_nam, Cur_RE, RE[Cur_RE].E_wsit, Dig_Dat[D_D_Cnt-1],Dig_Dat[D_D_Cnt] ); */ 
         /*   block_cut_pos =  base_real_pos - 2 - ((block_Cnt-1) * basesPerLine) + RE[Cur_RE].E_tcut; */
            block_cut_pos =  base_real_pos - 1 - ((block_Cnt-1) * basesPerLine); /* see if it's this easy?! */

            /* Now locate it in the block, checking for under- and over-runs */
            if (block_cut_pos >= 0) { /* if the position is greater than the beginning of the block */
               if (block_cut_pos <= basesPerLine) { /* and if it's less than the end of the block... */
                  abcp = O_LMARG + block_cut_pos - 1; /* abcp = adjusted block_cut_pos - used x times below */
                  O_txt[O_SEQ_LINE-1][abcp] = '\\';   /* write the 'cut' indicator */
                  /* locate the position for writing the name */
                  while (block_cut_pos < ok2wr[okline]) okline--; /* then go up to the lowest 'free' line */
                  if (okline != O_SEQ_LINE-2)  { /* but if can't place the name on the lowest line.. */
                     i = O_SEQ_LINE-1; k = 0;   /* check if there's any space lower between previous names */
                     /* following 'while' must be ordered like it is - i will be decr only if k == 0 */
                     while (k == 0 && --i != okline) { /* leaving 1 space before and after the name */
                        if (O_txt[i][abcp-1] == ' ' &&                /* this can be replaced with something */
                            O_txt[i][abcp+RE[Cur_RE].E_nam_l] == ' ' &&  /* more efficient later */
                            O_txt[i][abcp+3] == ' ' &&       /* must check for a space here... */               
                            O_txt[i][abcp+4] == ' ' )  k = 1;  /* as well as here */
                          /* i = vertical counter ~= okline, k = 1 when we find enuf space */
                     }
                     okline = i;
                  }
                  /* and memcpy the RE name from the struct to the output array */
                  memcpy (&O_txt[okline][abcp],&RE[Cur_RE].E_nam, RE[Cur_RE].E_nam_l);
                  /* and incr the ok2wr posn for that line by the length of the RE name + 1 */
                  /* 'ok2wr[okline]' below is modified only if we didn't find any lower spaces */
                  if (block_cut_pos+RE[Cur_RE].E_nam_l+1 > ok2wr[okline]) {
                     ok2wr[okline] = block_cut_pos+RE[Cur_RE].E_nam_l + 1; 
                  }
               } /* if (block_cut_pos < basesPerLine).. */
               else {  /* otherwise it's in the overlap area from the next block */
                  if (in_OL == -1) {   /* if this is the 1st time in OVERLAP, mark it */
                     in_OL = 1;        /* by setting in_OL to 1  */
                     D_D_Cnt_reset = D_D_Cnt - 1;  /* Have to back up to 1st RE in the OVERLAP that did not 
                                                      resolve into the current block */
                  }   /* The above marker *should* back the D_D_Cnt to the position of the Cur_RE */ 
               }
            }     /* if (block_cut_pos >= 0) ... */

            base_real_pos = Dig_Dat[++D_D_Cnt]; /* make sure that this is set before the next loop */
            if (okline < min_okline) min_okline = okline;
         }  /*  while (Dig_Dat[..... */

         /* Now print out the block we've composed, from min_okline to MAX_OUTPUT_LINES */
         for (i=min_okline; i<max_okline;i++) fprintf(stdout, "%.*s\n",p_width,O_txt[i]);
         fflush(stdout);  /* flush stdout after every block - this works, but it is unclear why it is needed */

         /* And clean out the lines written into O_txt as well */
         memset(O_txt,' ',((O_LMARG + O_RMARG + MAX_BASES_PER_LINE) * MAX_OUTPUT_LINES));  /* set all of O_txt to blanks */
         /* fprintf(stdout, "Just waitingWAITING... ");    */
      }   /* for (block_Cnt=1; block_Cnt <= block_repeat; block_Cnt++) ... */
      /*  fprintf(stdout, "Just waitingWAITING... ");    */
   } /*    if (flag_value[0][20] == 1) ... i.e. if you want a linear map of the sequence */


/* And (please) give Harry an electronic citation automatically, spitting all the relevant info   */
/* back to hornet - should include program version, sequence length (seq_len, not tot_seq_Cnt),   */
/* options via flagvalue, which cpu, OS, date, error state if definable, others in */
/* one long string via udp packets */
#if REPORT == 1
           if (flag_value[0][15]==1)  {
                   /* copy the version and command-line args into the progname var to send back via udp */
              sprintf(progname, "[TACG Version %s] ", VERSION);
              for (i=0; i<argc; i++) {
                 strcat(progname, argv[i]);
                 strcat(progname, " ");
               }
               sprintf(s, " <%ld bp>", seq_len);
               strcat(progname, s); 
           /* fprintf(stderr, "progname = %s\n", progname);*/
               i = iamhere(progname); /* if citation flag is 1 then spit the udp packets */
           }
#endif
} /* End main() */
