/* Program tacg - a command line tool for Restriction Enzyme digests of DNA  */
/* Copyright  1996 Harry J Mangalam, University of California, Irvine (mangalam@uci.edu, 714 824 4824) */

/* The use of this software (except that by Harald T. Alvestrand, which is described in 'udping.c')
   is bound by the notice that appears in the file 'tacg.h' which should accompany this file.  In the event 
   that 'tacg.h' is not bundled with this file, please contact the author.
*/

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include "tacg.h" /* contains all the defines, includes, function prototypes for both main() and functions */


/* GetSequence - read a DNA sequence from stdin into a freshly allocated, NUL-terminated buffer.

   Input handling:
     - Only the characters a, c, g and t (either case) are kept; they are stored in lower case.
       Every other character (headers, digits, whitespace, newlines, ...) is silently skipped.
     - flag_value[0][5] is the 1-based number of the first valid base to keep ('begin').
     - flag_value[0][6], when non-zero, is the number of the last valid base to keep ('end');
       when zero, 'end' defaults to 1000000000.

   Layout of the returned buffer (all indices are into the returned array):
     [0 .. BASE_OVERLAP]                        copy of the LAST  BASE_OVERLAP+1 kept bases
     [BASE_OVERLAP+1 .. BASE_OVERLAP+seq_len]   the kept bases themselves
     [.. + BASE_OVERLAP more]                   copy of the FIRST BASE_OVERLAP   kept bases
     followed by a terminating '\0'
   The bracketing copies are there so circular sequences can be handled.

   Outputs:
     *tot_seq_Cnt  index of the last character before the terminating '\0'
                   (i.e. the kept bases plus both bracketing overlaps)
     *seq_length   number of bases actually kept
     return value  pointer to the heap buffer (obtained from calloc/realloc; nothing in this
                   function frees it - presumably the caller owns it, TODO confirm)

   The function does not return on failure: it prints a message to stderr and calls exit(1) if
   stdin is empty, if memory cannot be obtained, or if fewer than 30 bases were kept.
*/
char *GetSequence(long *tot_seq_Cnt, long flag_value[2][NFLAGS], long *seq_length ) {
   long eoseq = 30000;          /* current capacity of 'sequence', in bytes */
   long end = 1000000000;       /* last base wanted; overridden by flag_value[0][6] if set */
   long seq_Cntr = 1;           /* 1-based count of valid bases seen so far (kept or not) */
   long begin, totSeqCnt, seq_len, final_size, i, l, m;
   int  c, realloc_iter = 0;
   char *sequence, *resized, ct, cur_seq_line[256];

   /* Get mem for sequence; calloc so the not-yet-filled leading overlap area starts out zeroed */
   sequence = calloc((size_t) eoseq, sizeof *sequence);
   if (sequence == NULL) {
      fprintf(stderr, "Boom!! calloc failed on initial call to get space "
                      "      for the sequence data!\n");
      exit (1);
   }

   /* leave room for the wrapped seq that gets inserted in front of the real seq */
   totSeqCnt = BASE_OVERLAP + 1;

   begin = flag_value[0][5];
   if (flag_value[0][6] != 0) end = flag_value[0][6];

   /* Peek at stdin.  The result of getc() has to stay an int: squeezing it into a char first
      makes the EOF comparison unreliable (never true for unsigned char, falsely true for 0xFF) */
   c = getc(stdin);
   if (c == EOF) { /* nothing coming in on stdin (like 'tacg' by itself) */
      fprintf(stderr, "tacg requires sequence via stdin, either via '<file' or a pipe (|) from another program.\n");
      exit(1);
   }
   ungetc(c, stdin); /* push the character back and get the rest by the usual method */

   /* fgets() never writes past cur_seq_line; a line longer than the buffer simply arrives in
      several chunks, and the newline it keeps is dropped by the filter below like any non-base */
   while (fgets(cur_seq_line, (int) sizeof cur_seq_line, stdin) != NULL) {
      /* one chunk adds fewer than sizeof cur_seq_line bases, so this much head room is enough */
      if (eoseq - totSeqCnt < (long) sizeof cur_seq_line) {
         resized = realloc(sequence, (size_t) (eoseq + 30000));
         if (resized == NULL) {
            fprintf(stderr, "Boom!! realloc failed on %d call to get more space!\n", realloc_iter);
            exit (1);
         }
         sequence = resized;
         eoseq += 30000;
         realloc_iter++;
      }

      for (i = 0; cur_seq_line[i] != '\0'; i++) {
         /* <ctype.h> functions need an unsigned char value; a negative char is undefined */
         ct = (char) tolower((unsigned char) cur_seq_line[i]);
         switch (ct) {
            case 'a': case 'c': case 'g': case 't':  /* a valid base */
               if (seq_Cntr++ >= begin) sequence[totSeqCnt++] = ct;  /* inside the wanted range: keep it */
               if (seq_Cntr > end) goto gotAllSeq;   /* past the wanted range: stop reading altogether */
               break;

            default:  /* not acgt: don't count it, don't care */
               break;
         }
      }
   }  /* at this point 'totSeqCnt' points just past the end of the sequence */

   if (totSeqCnt == BASE_OVERLAP + 1) fprintf (stderr, "We NEED Interactive() already!!!\n");

   gotAllSeq: /* fast exit point from the read loops */
   seq_len = totSeqCnt - BASE_OVERLAP - 1; /* number of bases kept; handed back via *seq_length */
   if (seq_len < 30)  {
      fprintf(stderr, "sequence read in is too short (<30 bp)\n");
      exit (1);
   }

   /* The padding below writes up to index totSeqCnt+BASE_OVERLAP (which ends up holding the
      terminating NUL), so the buffer must be exactly that long plus one.  Resize BEFORE padding:
      the read loop only guarantees about one spare byte past totSeqCnt. */
   final_size = totSeqCnt + BASE_OVERLAP + 1;
   resized = realloc(sequence, (size_t) final_size);
   if (resized != NULL) {
      sequence = resized;
   } else if (final_size > eoseq) {  /* needed to grow and could not */
      fprintf(stderr, "Boom!! realloc failed on %d call to get more space!\n", realloc_iter);
      exit (1);
   }  /* else: a failed shrink leaves the old, larger block intact - carry on with it */

   /* Pad the end with the beginning: copy BASE_OVERLAP+1 bases, starting at the first kept base,
      to just past the last kept base.  Done one char at a time, front to back, on purpose: the
      source and destination ranges may overlap when the sequence is shorter than the overlap.
      'totSeqCnt' itself doesn't change here. */
   for (l = BASE_OVERLAP + 1, m = 0; m <= BASE_OVERLAP; l++, m++)  {
      sequence[totSeqCnt + m] = sequence[l];
   }

   /* Pad the beginning with the end: the BASE_OVERLAP+1 characters ending at the last kept base
      (they start at index seq_len) go into indices 0..BASE_OVERLAP */
   for (m = 0; m <= BASE_OVERLAP; m++)  {
      sequence[m] = sequence[seq_len + m];
   }

   if (flag_value[0][5] != 1 || flag_value[0][6] != 0) fprintf(stderr, "Subsequence is %ld bases.\n\n", seq_len);

   /* Terminate the string; this overwrites the last (surplus) character of the trailing copy,
      leaving BASE_OVERLAP padded bases after the real sequence */
   totSeqCnt += BASE_OVERLAP;
   sequence[totSeqCnt] = '\0';
   totSeqCnt--;  /* now the index of the last character of the padded sequence */

   /* sequence is padded with extra bases to allow circular cutting, but anything that depends
      on DNA length should use *seq_length */
   *tot_seq_Cnt = totSeqCnt;
   *seq_length = seq_len;
   return sequence;
}
