#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/time.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <ctype.h>
#include <regex.h>
#include <signal.h>
#include <locale.h>
#ifdef HAVE_LIBUTIL_H
#include <libutil.h>
#endif
#ifdef HAVE_GETOPT_H
#include <getopt.h>
#endif
#include "mysql.h"
#include "config.h"
#include "udm_syslog.h"
#include "indexer.h"
#include "mysqlint.h"
#include "parseurl.h"
#include "openhost.h"
#include "charset.h"
#include "parser.h"
#include "mysqlint.h"
#include "spell.h"
#include "md5.h"
#define MD5_DIGEST(data, size, digest) md5_buffer_str(data, size, digest)

#ifndef CONFIGDIR
#define CONFIGDIR "."
#endif

/* Row limit (LIMIT clause) of the SELECT that fetches the next batch of URLs to index. */
#define URL_SELECT_CACHE 16
/* Row limit of each SELECT/DELETE batch issued by ClearDatabase(). */
#define URL_DELETE_CACHE 50

/* Free x when it is non-NULL and reset it to NULL. Expands to a bare { } block. */
#define free_it(x)		{if(x){free(x);x=NULL;}}
/* Compare only the first strlen(y) characters: zero when x begins with y. */
#define STRNCMP(x,y)		strncmp(x,y,strlen(y))
/* Case-insensitive variant of STRNCMP. */
#define STRNCASECMP(x,y)	strncasecmp(x,y,strlen(y))
/* Advance s past every leading character that IS in set. The expansion already ends with ';'. */
#define SKIP(s,set)		while((*s)&&(strchr(set,*s)))s++;
/* Advance s up to the first character that IS in set (or to the terminating NUL). */
#define SKIPN(s,set)		while((*s)&&(!strchr(set,*s)))s++;

/* HTML_* codes are not referenced in this part of the file;
 * presumably HTML parser states (unknown/tag/text/comment) -- TODO confirm. */
#define HTML_UNK 0
#define HTML_TAG 1
#define HTML_TXT 2
#define HTML_COM 3

/* Charset name taken from the "LocalCharset" config command; passed to tolower_string() by AddWord(). */
char *localcharset= NULL;
/* Defined elsewhere; set from the -v command line option. */
extern int loglevel;

/* Facility handed to udm_openlog(); may be overridden by the "SyslogFacility" config command. */
#ifdef HAVE_SYSLOG_H
int log_facility=LOG_FACILITY;
#else
int log_facility=0;
#endif

/* Shared scratch buffer in which SQL statements are formatted before being executed. */
char qbuf[STRSIZ*4];

/* Word list of the current document: wcur entries used, wsize entries allocated (see AddWord()). */
int wcur,wsize;
WORD *Word=NULL;

/* Configured servers: nservers entries used, mservers entries allocated (see AddServer()). */
int nservers=0;
int mservers=0;
SERVER *Server=NULL;

/* robots exclusion rules loaded from the database by LoadRobots(). */
int nrobots=0;
ROBOT *Robots=NULL;

/* Compiled allow/disallow/checkonly regular expressions, in the order they were added by AddFilter(). */
int nfilters=0;
FILTER Filter[MAXFILTER];

/* Stop words loaded by LoadStopList(): in_stop_list is the number of entries in stoplist. */
int in_stop_list=0;
char **stoplist;

/*
 * Timeout flag: HTTPGet() clears it before a transfer and checks it
 * afterwards; the SIGALRM handler below sets it.
 * volatile sig_atomic_t is the only object type a signal handler may
 * portably write to.
 */
static volatile sig_atomic_t alarm_abort;

/* SIGALRM handler: only records that the alarm fired. */
static void sig_alarm(int signo){
	(void)signo;	/* the signal number is not needed */
	alarm_abort = 1;
}

#ifndef HAVE_TM_GMTOFF
/*
 * Fallback for platforms without tm_gmtoff: returns the difference
 * between local time and GMT, in seconds, computed by breaking the
 * current time down with both gmtime() and localtime().
 */
int get_gmtoff()
{
	time_t now = time(NULL);
	/* Copy the GMT breakdown: localtime() may reuse the same static buffer. */
	struct tm utc = *gmtime(&now);
	struct tm *loc = localtime(&now);
	int day_diff = loc->tm_yday - utc.tm_yday;
	int hour_diff;

	/* A day difference beyond +-1 means the two dates straddle a year boundary. */
	if (day_diff < -1)
		hour_diff = 24;
	else if (day_diff > 1)
		hour_diff = -24;
	else
		hour_diff = day_diff * 24;
	hour_diff += loc->tm_hour - utc.tm_hour;

	return((hour_diff * 60 + loc->tm_min - utc.tm_min) * 60);
}
#endif

int cmpserver(const void *s1,const void *s2)
{
	return(strlen(((SERVER*)s2)->url)-strlen(((SERVER*)s1)->url));
}


/*
 * Release every compiled filter regex and empty the filter table.
 * nfilters is reset so that a repeated call, or a later WhatToDo(),
 * never touches an already freed regex_t.
 * Always returns 0.
 */
int FreeFilters(void){
int i;
	for(i=0; i < nfilters; i++)
		regfree(&(Filter[i].filter));
	nfilters=0;
	return(0);
}


/*
 * Compile `filter` as a case-insensitive extended regex and append it
 * to the Filter table together with `filter_type`.
 * Returns 0 on success, 1 when the table is full or the expression
 * does not compile (a diagnostic is printed to stderr in both cases).
 */
int AddFilter(char * filter,int filter_type){
#define ERRSTRSIZE 100
char errstr[ERRSTRSIZE];
int err;

	if(nfilters>=MAXFILTER){
		fprintf(stderr, "Too many allow/disallow/checkonly commands in config file\nTry to increment MAXFILTER in indexer.h and recompile\n");
		return(1);
	}
	err=regcomp(&(Filter[nfilters].filter),filter,REG_EXTENDED|REG_ICASE);
	if(err){
		fprintf(stderr, "Config file error: wrong  regex: %s\n", filter);
		regerror(err, &(Filter[nfilters].filter), errstr, sizeof(errstr));
		fprintf(stderr, "Error is: %s\n", errstr);
		/* No regfree() here: after a failed regcomp() the contents of
		 * the regex_t are undefined and must not be released. */
		return(1);
	}
	Filter[nfilters].filter_type=filter_type;
	nfilters++;
	return(0);
}

/* Allow/Disallow/CheckOnly */
/*
 * Allow/Disallow/CheckOnly decision for a URL.
 * The filters are tried in the order they were added; the type of the
 * first one whose regex matches `url` is returned. When nothing
 * matches the URL is allowed (F_ALLOW).
 */
int WhatToDo(char *url){
#define NS 10
regmatch_t match[NS];
int idx=0;

	while(idx<nfilters){
		if(regexec(&(Filter[idx].filter),url,NS,match,0)==0)
			return(Filter[idx].filter_type);
		idx++;
	}
	return(F_ALLOW);
}


/*
 * Load the stop word table from the database into stoplist.
 * in_stop_list is set to the number of words actually stored, so
 * is_stopword() never reads an unfilled slot. NULL columns are
 * skipped and an allocation failure leaves a shorter (possibly
 * empty) list. A failed query goes through abort_indexer().
 * Always returns 0.
 */
int LoadStopList(){
MYSQL_RES *res;
MYSQL_ROW row;
char **list;
int nrows,loaded=0;

	if(safe_mysql_query(&mysql,"SELECT word FROM stopword"))
		abort_indexer();
	res=mysql_store_result(&mysql);
	if(!res)
		return(0);
	nrows=mysql_num_rows(res);
	list=(nrows>0)?(char **)malloc(nrows*sizeof(char *)):NULL;
	if(list){
		while(loaded<nrows&&(row=mysql_fetch_row(res))){
			if(!row[0])
				continue;
			list[loaded]=strdup(row[0]);
			if(!list[loaded])
				break;	/* out of memory: keep what was loaded so far */
			loaded++;
		}
		stoplist=list;
	}
	in_stop_list=loaded;
	mysql_free_result(res);
	return(0);
}



/*
 * Linear search of the stop word list.
 * Returns 1 when `word` is exactly equal to one of the in_stop_list
 * entries of stoplist, 0 otherwise.
 */
int is_stopword(char *word)
{
int pos=0;

	while(pos<in_stop_list){
		if(strcmp(word,stoplist[pos])==0)
			return(1);
		pos++;
	}
	return(0);
}


int AddServer(char *url,char *charset,char *proxy,
	int period,int hint,char *basic_auth,
	int outside,int maxhops,
	int gindex,int deletebad,int userobots,int gfollow,
	int bodyweight,int  titleweight,int  urlweight,
	int descweight,int  keywordweight,int max_net_errors,
	int read_timeout)
{
char *s;

	if(nservers>=mservers){
		if(mservers){
			mservers+=16;
			Server=(SERVER *)realloc(Server,mservers*sizeof(SERVER));
		}else{
			mservers=16;
			Server=(SERVER *)malloc(mservers*sizeof(SERVER));
		}
	}
	Server[nservers].url=strdup(url);
	Server[nservers].charset=charset?strdup(charset):NULL;
	Server[nservers].basic_auth=basic_auth?strdup(basic_auth):NULL;
	if(proxy){
		Server[nservers].proxy=strdup(proxy);
		s=strchr(Server[nservers].proxy,':');
		if(s){
			*s=0;
			Server[nservers].proxy_port=atoi(s+1)?atoi(s+1):DEFAULT_PROXY_PORT;
		}else{
			Server[nservers].proxy_port=DEFAULT_PROXY_PORT;
		}
	}else{
		Server[nservers].proxy=NULL;
		Server[nservers].proxy_port=0;
	}
	Server[nservers].period=period;
	Server[nservers].hint=hint;
	Server[nservers].net_errors=0;
	Server[nservers].outside=outside;
	Server[nservers].maxhops=maxhops;
	Server[nservers].gindex=gindex;
	Server[nservers].gfollow=gfollow;
	Server[nservers].deletebad=deletebad;
	Server[nservers].userobots=userobots;
	Server[nservers].bodyweight=bodyweight;
	Server[nservers].titleweight=titleweight;
	Server[nservers].urlweight=urlweight;
	Server[nservers].descweight=descweight;
	Server[nservers].keywordweight=keywordweight;
	Server[nservers].max_net_errors=max_net_errors;
	Server[nservers].read_timeout=read_timeout;
	nservers++;
	return(0);
}

/*
 * Find the Server entry responsible for `url`.
 * The URL is first passed through remove2dot(). An ordinary URL
 * matches the first server whose url is a prefix of it. A URL that
 * ends in "/robots.txt" is cut before that suffix for the duration of
 * the search and matches the first server whose url begins with the
 * remaining part; the '/' is put back before returning.
 * Returns the index into Server, or -1 when there is no match or no
 * server table.
 */
int FindServer(char *url){
int idx;
char *tail;

	if(Server==NULL)
		return(-1);
	url=remove2dot(url);
	tail=strstr(url,"/robots.txt");
	if(tail==NULL||strcmp(tail,"/robots.txt")!=0){
		for(idx=0;idx<nservers;idx++){
			if(strncmp(url,Server[idx].url,strlen(Server[idx].url))==0)
				return(idx);
		}
		return(-1);
	}
	*tail=0;
	for(idx=0;idx<nservers;idx++){
		if(strncmp(Server[idx].url,url,strlen(url))==0)
			break;
	}
	*tail='/';
	return(idx<nservers?idx:-1);
}

int FindRobots(char *url){
int i;
	for(i=0;i<nrobots;i++)
		if(strstr(url,Robots[i].hostinfo)&&strstr(url,Robots[i].path))
			return(i);
	return(-1);
}

/*
 * Insert `url` into the url table with the given referrer and hop
 * count. A duplicate-key error is ignored (the URL is already known);
 * any other query failure goes through abort_indexer().
 * Returns 0 on success or duplicate, 1 when the escape buffer cannot
 * be allocated or the statement does not fit into qbuf.
 */
int AddURL(char *url,int referrer,int hops)
{
size_t len;
char *e_url;
int n;

	len=strlen(url);
	/* mysql_escape_string() may double every byte and appends a NUL */
	e_url=(char*)malloc(len*2+1);
	if(!e_url)
		return(1);
	mysql_escape_string(e_url,url,len);
	n=snprintf(qbuf,sizeof(qbuf),"INSERT INTO url (url,referrer,hops) VALUES ('%s',%d,%d)",e_url,referrer,hops);
	free(e_url);
	if(n<0||(size_t)n>=sizeof(qbuf))
		return(1);	/* never send a truncated statement */
	if(safe_mysql_query(&mysql,qbuf)){
		if(mysql_errno(&mysql)!=ER_DUP_ENTRY){
			abort_indexer();
		}
	}
	return(0);
}

/*
 * Remove every dict row that belongs to the document `url_id`.
 * A failed query goes through abort_indexer(). Always returns 0.
 */
int DeleteWordFromURL(int url_id)
{
	sprintf(qbuf,"DELETE FROM dict WHERE url_id=%d",url_id);
	if(safe_mysql_query(&mysql,qbuf)!=0){
		abort_indexer();
	}
	return(0);
}

/*
 * Delete all robots rows stored for `hostinfo`.
 * hostinfo originates from parsed URLs, so it is escaped before being
 * placed inside the SQL string literal.
 * Returns 0 on success, 1 when the escape buffer cannot be allocated
 * or the statement does not fit into qbuf. A failed query goes
 * through abort_indexer().
 */
int DeleteRobotsFromHost(char *hostinfo)
{
size_t len;
char *e_host;
int n;

	len=strlen(hostinfo);
	/* mysql_escape_string() needs room for 2*len bytes plus the NUL */
	e_host=(char*)malloc(len*2+1);
	if(!e_host)
		return(1);
	mysql_escape_string(e_host,hostinfo,len);
	n=snprintf(qbuf,sizeof(qbuf),"DELETE FROM robots WHERE hostinfo='%s'",e_host);
	free(e_host);
	if(n<0||(size_t)n>=sizeof(qbuf))
		return(1);
	if(safe_mysql_query(&mysql,qbuf)) abort_indexer();
	return(0);
}

/*
 * Remove a document: first its rows in dict, then its row in url.
 * Each failed query goes through abort_indexer(). Always returns 0.
 */
int DeleteUrl(int url_id)
{
static const char *const stmts[2]={
	"DELETE FROM dict WHERE url_id=%d",
	"DELETE FROM url WHERE rec_id=%d"
};
int k;

	for(k=0;k<2;k++){
		sprintf(qbuf,stmts[k],url_id);
		if(safe_mysql_query(&mysql,qbuf))
			abort_indexer();
	}
	return(0);
}

/*
 * Record the outcome of an indexing attempt for `url_id`: stores
 * `status`, sets last_index_time to now and schedules the next visit
 * `period` seconds from now.
 * A failed query goes through abort_indexer(). Always returns 0 (the
 * function is declared int and previously fell off its end without a
 * return value).
 */
int UpdateUrl(int url_id,int status,int period){
	sprintf(qbuf,"UPDATE url SET status=%d,last_index_time=NOW(),next_index_time=from_unixtime(unix_timestamp()+%d) WHERE rec_id='%d'",
		status,period,url_id);
	if(safe_mysql_query(&mysql,qbuf))
		abort_indexer();
	return(0);
}

int AddWord(char *word,int where,int checkstop)
{
int i;
char *s;

	if(!word[0])return(0);
	s=strdup(word);
	tolower_string(s,localcharset);
	if(checkstop){
		if(is_stopword(s)){
			free(s);return(0);
		}
	}
	for(i=0;i<wcur;i++){
		if((!strcmp(Word[i].word,s))&&(Word[i].count<256)){
			Word[i].count|=where;
			free(s);return(0);
		}
	}
	if(wcur>=wsize){
		if(wsize){
			wsize+=WSIZE;
			Word=(WORD *)realloc(Word,wsize*sizeof(WORD));
		}else{
			wsize=WSIZE;
			Word=(WORD *)malloc(wsize*sizeof(WORD));
		}
	}
	Word[wcur].word=s;
	Word[wcur].count=where;
	wcur++;return(0);
}

/*
 * Release the robots rule table: the two strings of every entry,
 * then the array itself. Robots is left NULL and nrobots 0.
 * Always returns 0.
 */
int FreeRobots(){
	int idx=0;

	while(idx<nrobots){
		free(Robots[idx].hostinfo);
		free(Robots[idx].path);
		idx++;
	}
	if(Robots){
		free(Robots);
		Robots=NULL;
	}
	nrobots=0;
	return(0);
}

int LoadRobots(){
MYSQL_RES *res;
MYSQL_ROW row;
int i=0;
	FreeRobots();
	if(safe_mysql_query(&mysql,"SELECT hostinfo,path FROM robots"))
		abort_indexer();
	if(res=mysql_store_result(&mysql)){
		nrobots=mysql_num_rows(res);
		if(nrobots)Robots=(ROBOT *)malloc(nrobots*sizeof(ROBOT));
		while(row=mysql_fetch_row(res)){
			Robots[i].hostinfo=strdup(row[0]);
			Robots[i].path=strdup(row[1]);
			i++;
		}
		mysql_free_result(res);
	}
	return(0);
}

/*
 * Synchronise the dict table with the Word list of document `url_id`.
 *
 * The words currently stored for the document are merged into the
 * Word list with their intag shifted into bits 8..15 of count, so
 * every entry carries the stored value (count/256) next to the fresh
 * one (count&255). Then, per entry:
 *   stored only         -> DELETE
 *   fresh only          -> INSERT
 *   both, but different -> UPDATE
 * The dict table is write-locked during the pass; the lock is dropped
 * and re-taken each time more than 1024 statements have been issued.
 * Every failed query goes through abort_indexer().
 *
 * Returns non-zero when at least one row was deleted, added or
 * updated, 0 otherwise.
 */
int StoreWords(int url_id)
{
int idx,batch;
int prev_tag,cur_tag;
int n_added=0,n_deleted=0,n_updated=0;
MYSQL_RES *res;
MYSQL_ROW row;

	sprintf(qbuf,"SELECT word,intag FROM dict WHERE url_id=%d",url_id);
	if(safe_mysql_query(&mysql,qbuf))
		abort_indexer();
	res=mysql_store_result(&mysql);
	if(res!=NULL){
		while((row=mysql_fetch_row(res))!=NULL){
			if(row[0]&&row[1])
				AddWord(row[0],atoi(row[1])*256,0);
		}
		mysql_free_result(res);
	}

	sprintf(qbuf,"LOCK TABLES dict WRITE");
	if(safe_mysql_query(&mysql,qbuf))
		abort_indexer();

	batch=0;
	for(idx=0;idx<wcur;idx++){
		if(batch>1024){
			/* release the lock briefly between batches */
			batch=0;
			sprintf(qbuf,"UNLOCK TABLES");
			if(safe_mysql_query(&mysql,qbuf))
				abort_indexer();
			sprintf(qbuf,"LOCK TABLES dict WRITE");
			if(safe_mysql_query(&mysql,qbuf))
				abort_indexer();
		}
		prev_tag=(Word[idx].count/256)&255;
		cur_tag=Word[idx].count&255;
		if(!cur_tag&&prev_tag){
			sprintf(qbuf,"DELETE FROM dict WHERE url_id=%d AND word='%s'",url_id,Word[idx].word);
			if(safe_mysql_query(&mysql,qbuf))
				abort_indexer();
			batch++;
			n_deleted++;
		}else if(cur_tag&&!prev_tag){
			sprintf(qbuf,"INSERT INTO dict (url_id,word,intag) VALUES(%d,'%s',%d)",url_id,Word[idx].word,Word[idx].count);
			if(safe_mysql_query(&mysql,qbuf))
				abort_indexer();
			batch++;
			n_added++;
		}else if(cur_tag&&prev_tag&&cur_tag!=prev_tag){
			sprintf(qbuf,"UPDATE dict SET intag=%d WHERE word='%s' AND url_id='%d'",cur_tag,Word[idx].word,url_id);
			if(safe_mysql_query(&mysql,qbuf))
				abort_indexer();
			batch++;
			n_updated++;
		}
	}

	sprintf(qbuf,"UNLOCK TABLES");
	if(safe_mysql_query(&mysql,qbuf))
		abort_indexer();
	return(n_deleted||n_added||n_updated);
}

/*
 * Free the strings of all entries of the Word list and mark the list
 * empty. The Word array itself is kept for reuse. Always returns 0.
 */
int FreeWords() {
	int remaining=wcur;

	while(remaining>0){
		remaining--;
		free(Word[remaining].word);
	}
	wcur=0;
	return(0);
}




/* Buffer size sufficient for the BASE64 form of `len` input bytes,
 * including the terminating NUL. */
#define BASE64_LEN(len) (4 * (((len) + 2) / 3) +2)
/* BASE64 encoding: 3x8 bits -> 4x6 bits */
static const char base64[64] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/*
 * Encode the first `length` bytes of `s` as BASE64 into `store`,
 * which must provide at least BASE64_LEN(length) bytes. The result is
 * padded with '=' to a multiple of four characters and NUL-terminated.
 *
 * The input is read as unsigned bytes, so bytes >= 0x80 cannot
 * produce a negative table index, and no byte beyond s[length-1] is
 * ever read: a trailing group of one or two bytes is encoded
 * separately.
 */
void base64_encode (const char *s, char *store, int length)
{
const unsigned char *in = (const unsigned char *)s;
char *p = store;
int i;

	/* complete 3-byte groups */
	for (i = 0; i + 2 < length; i += 3){
		*p++ = base64[in[i] >> 2];
		*p++ = base64[((in[i] & 3) << 4) | (in[i+1] >> 4)];
		*p++ = base64[((in[i+1] & 0xf) << 2) | (in[i+2] >> 6)];
		*p++ = base64[in[i+2] & 0x3f];
	}
	/* trailing partial group, padded */
	if (length - i == 1){
		*p++ = base64[in[i] >> 2];
		*p++ = base64[(in[i] & 3) << 4];
		*p++ = '=';
		*p++ = '=';
	}else if (length - i == 2){
		*p++ = base64[in[i] >> 2];
		*p++ = base64[((in[i] & 3) << 4) | (in[i+1] >> 4)];
		*p++ = base64[(in[i+1] & 0xf) << 2];
		*p++ = '=';
	}
	*p = '\0';
}

/*
 * Map an HTTP status code to a short description.
 * 0 stands for a document that has not been indexed yet; any code
 * without an entry yields "Unknown status". The returned string is a
 * literal and must not be modified or freed.
 */
char *HTTPErrMsg(int code){
	switch(code){
	case 0:   return("Not indexed yet");
	case 200: return("OK");
	case 300: return("Multiple Choices");
	case 301: return("Moved Permanently");
	case 302: return("Moved Temporarily");
	case 303: return("See Other");
	case 304: return("Not Modified");
	case 305: return("Use Proxy (proxy redirect)");
	case 400: return("Bad Request");
	case 401: return("Unauthorized");
	case 402: return("Payment Required");
	case 403: return("Forbidden");
	case 404: return("Not found");
	case 405: return("Method Not Allowed");
	case 406: return("Not Acceptable");
	case 407: return("Proxy Authentication Required");
	case 408: return("Request Timeout");
	case 409: return("Conflict");
	case 410: return("Gone");
	case 411: return("Length Required");
	case 412: return("Precondition Failed");
	case 413: return("Request Entity Too Large");
	case 414: return("Request-URI Too Long");
	case 415: return("Unsupported Media Type");
	case 500: return("Internal Server Error");
	case 501: return("Not Implemented");
	case 502: return("Bad Gateway");
	case 503: return("Service Unavailable");
	case 504: return("Gateway Timeout");
	case 505: return("HTTP Version Not Supported");
	default:  return("Unknown status");
	}
}

/*
 * Send the request `header` to host:port and read the response into
 * `dist`.
 *
 * At most maxsize-1 bytes are stored; the buffer is not
 * NUL-terminated here. The alarm is re-armed only once, before the
 * request is sent, so `read_timeout` seconds bound the whole
 * transfer.
 *
 * Returns the number of bytes read, the negative value returned by
 * open_host() when the connection fails, or -3 on timeout. A read
 * error that is not a timeout ends the transfer and the bytes
 * received so far are returned.
 */
int HTTPGet(char *host,int port,char *header,char *dist,int maxsize,int read_timeout){
int fd,got,size;
char *s;
void (*old_alarm)(int);

	/* Connect to HTTP or PROXY server */
	fd=open_host(host,port,read_timeout);	/* Connect           */
	if(fd<0)return(fd);			/* Could not connect */

	/* Install the handler before arming the alarm, so an early
	 * SIGALRM can never hit the previous disposition. */
	alarm_abort=0;
	old_alarm=signal(SIGALRM,sig_alarm);
	alarm(read_timeout);

	/* NOTE(review): a short or failed write is not retried; a
	 * failure shows up as an empty response below. */
	write(fd,header,strlen(header));	/* Send HTTP request */

	size=0;s=dist;				/* Retrieve response */
	while(!alarm_abort&&size<maxsize-1){
		got=read(fd,s,maxsize-1-size);
		if(got<=0)
			break;	/* end of data, or error (-1 must not be added to size) */
		size+=got;s+=got;
		signal(SIGALRM,sig_alarm);
	}
	close(fd);/* Close HTTP connection */
	alarm(0);signal(SIGALRM,old_alarm);
	if(alarm_abort)return(-3);
	return(size);
}

int usage(){
	fprintf(stderr,
"
indexer from UdmSearch v." VERSION ". http://search.udm.net/about/
(C) 1998-1999, UdmSearch Developers Team.

Usage: indexer [OPTIONS]  [configfile]

Indexing options:
  -a            reindex All documents (even if not expired)
  -m            reindex if not Modified
  -e            more Expired first
  -n number     index only Number documents and exit
  -p number     sleeP number seconds arter each URL
  -q            quick startup (do not add Server URLs)

Subsection control options:
  -t tag        set filter on Tag
  -u pattern    set filter on URL (SQL LIKE wildcards)
  -s status     set filter on HTTP Status

Logging options:
"
#ifdef LOG_PERROR
"  -l            do not Log to stdout/stderr
"
#endif
"  -v number     Verbose level, 0-5

Ispell import options:
  -L language   Two letters Language code (en, ru, de etc.)
  -A filename   ispell Affix file
  -D filename   ispell Dictionary file
  -d            dump to stdout instead of storing to database
  
Mics. options:
  -C            Clear database and exit
  -S            print Statistics and exit
  -I            print referers and exit
  -h,-?         print this Help page and exit


Mail bug reports and suggestions to <devel@search.udm.net>.
");
	return(0);
}

/*
 * Interactive database cleanup (option -C). Never returns to the
 * caller: the process exits with status 0 in every case.
 *
 * The user has to answer with a line starting with "YES". Without
 * any tag/url/status condition both tables are emptied outright.
 * Otherwise matching url rows are selected in batches of
 * URL_DELETE_CACHE and deleted from dict and url until no row
 * matches any more. Failed queries go through abort_indexer().
 *
 * tagstr, urlstr and statusstr are ready-made SQL fragments (possibly
 * empty strings) appended to the WHERE clause.
 */
int ClearDatabase(char *tagstr,char *urlstr,char *statusstr){
	MYSQL_RES *res;
	MYSQL_ROW row;
	char urlin[STRSIZ]="";char str[STRSIZ]="";
	int i,j,nrows;
	size_t used,idlen;

	printf("You are going to delete database contents\n");
	printf("Are you sure?(YES/no)\n");
	if(!fgets(str,sizeof(str),stdin)||strncmp(str,"YES",3))
		exit(0);

	if(!tagstr[0]&&!urlstr[0]&&!statusstr[0]){
		printf("Deleting all records\n");
		sprintf(qbuf,"DELETE FROM dict");
		if(safe_mysql_query(&mysql,qbuf))abort_indexer();
		sprintf(qbuf,"DELETE FROM url");
		if(safe_mysql_query(&mysql,qbuf))abort_indexer();
		printf("Done\n");
	}else{
		printf("Deleting\n");j=0;
		while(1){
			sprintf(qbuf,"SELECT rec_id FROM url WHERE 1=1 %s %s %s LIMIT %d",tagstr,urlstr,statusstr,URL_DELETE_CACHE);
			if(safe_mysql_query(&mysql,qbuf))abort_indexer();
			res=mysql_store_result(&mysql);
			if(!res){
				/* Without a result set nothing can be deleted;
				 * retrying would loop forever. */
				abort_indexer();
				break;
			}
			nrows=mysql_num_rows(res);
			/* Build the comma separated id list, never writing past urlin.
			 * Ids that do not fit are picked up by the next SELECT. */
			urlin[0]=0;used=0;
			for(i=0;i<nrows;i++){
				row=mysql_fetch_row(res);
				if(!row||!row[0])
					break;
				idlen=strlen(row[0]);
				if(used+(i?1:0)+idlen+1>sizeof(urlin))
					break;
				if(i)urlin[used++]=',';
				memcpy(urlin+used,row[0],idlen+1);
				used+=idlen;
			}
			mysql_free_result(res);
			if(!i)
				break;	/* nothing (more) to delete */
			j+=i;
			sprintf(qbuf,"DELETE FROM dict WHERE url_id in (%s)",urlin);
			if(safe_mysql_query(&mysql,qbuf))abort_indexer();
			sprintf(qbuf,"DELETE FROM url WHERE rec_id in (%s)",urlin);
			if(safe_mysql_query(&mysql,qbuf))abort_indexer();
			printf("%d records\r",j);fflush(stdout);
		}
		printf("\nDone\n");
	}
	exit(0);
	return(0);	/* not reached */
}

	
/*
 * Print per-status document statistics (option -S) and exit with
 * status 0; the function never returns to its caller.
 *
 * For every status value found in the url table one line shows the
 * number of expired documents, the total number of documents and the
 * HTTPErrMsg() text of the status, followed by a line of totals.
 * tagstr, urlstr and statusstr are SQL fragments appended to the
 * WHERE clause. A failed query goes through abort_indexer().
 */
int ShowStatistics(char *tagstr,char *urlstr,char *statusstr){
	MYSQL_RES *res;
	MYSQL_ROW row;
	int expired_sum=0;
	int all_sum=0;
	int remaining;

	printf("\n          UdmSearch statistics\n\n");
	sprintf(qbuf,"SELECT status,sum(next_index_time<=NOW()),count(*) FROM url WHERE 1=1 %s %s %s GROUP BY status",tagstr,urlstr,statusstr);
	if(safe_mysql_query(&mysql,qbuf))
		abort_indexer();
	res=mysql_store_result(&mysql);
	if(res==NULL)
		exit(0);

	remaining=mysql_num_rows(res);
	printf("%10s %10s %10s\n","Status","Expired","Total");
	printf("   -----------------------------\n");
	while(remaining>0){
		row=mysql_fetch_row(res);
		printf("%10s %10s %10s %s\n",
			row[0],row[1],row[2],
			HTTPErrMsg(atoi(row[0])));
		expired_sum+=atoi(row[1]);
		all_sum+=atoi(row[2]);
		remaining--;
	}
	mysql_free_result(res);
	printf("   -----------------------------\n");
	printf("%10s %10d %10d\n\n","Total",expired_sum,all_sum);
	exit(0);
}

/*
 * Print "referrer -> url (status)" for every url row whose referrer
 * exists in the url table (option -I), then exit with status 0; the
 * function never returns to its caller.
 * tagstr, urlstr and statusstr are SQL fragments appended to the
 * WHERE clause. A failed query goes through abort_indexer().
 */
int ShowReferers(char *tagstr,char *urlstr,char *statusstr){
	MYSQL_RES *res;
	MYSQL_ROW row;

	printf("\n          URLs and referers \n\n");
	sprintf(qbuf,"SELECT url2.url,url.url,url.status FROM url,url url2 WHERE url.referrer=url2.rec_id %s %s %s ",tagstr,urlstr,statusstr);
	if(safe_mysql_query(&mysql,qbuf))
		abort_indexer();
	res=mysql_store_result(&mysql);
	if(res==NULL)
		exit(0);
	for(row=mysql_fetch_row(res);row!=NULL;row=mysql_fetch_row(res)){
		printf("%s -> %s (%s)\n",
			row[0],row[1],row[2]);
	}
	mysql_free_result(res);
	exit(0);
}

/*
 * Parse the text of a robots.txt file and replace the robots rows
 * stored for `hostinfo`.
 *
 * `content` is split in place into lines. A "User-Agent" line selects
 * the following rules when it mentions INDEXER_USER_AGENT or "*";
 * each selected "Disallow" line contributes the first blank-delimited
 * word after its ninth character, with a trailing '#' comment
 * removed.
 *
 * Both hostinfo and the path come from remote data, so they are
 * escaped before being placed into SQL string literals, and a
 * statement that would not fit into qbuf is skipped instead of being
 * sent truncated. Failed queries go through abort_indexer().
 *
 * Returns 0 on success, 1 when the old rules could not be removed
 * because of an allocation failure or an over-long hostinfo.
 */
int ParseRobots(char *content,char *hostinfo){
	int myrule=0;
	int n;
	size_t hostlen,pathlen;
	char *s,*e,*e_host,*e_path;

	hostlen=strlen(hostinfo);
	/* mysql_escape_string() needs room for 2*len bytes plus the NUL */
	e_host=(char*)malloc(hostlen*2+1);
	if(!e_host)
		return(1);
	mysql_escape_string(e_host,hostinfo,hostlen);

	n=snprintf(qbuf,sizeof(qbuf),"DELETE FROM robots WHERE hostinfo='%s'",e_host);
	if(n<0||(size_t)n>=sizeof(qbuf)){
		free(e_host);
		return(1);
	}
	if(safe_mysql_query(&mysql,qbuf))abort_indexer();

	s=strtok(content,"\r\n");
	while(s){
		if(*s=='#'){
			/* comment line */
		}else
		if(!(STRNCASECMP(s,"User-Agent"))){
			if(strstr(s,INDEXER_USER_AGENT)||strstr(s,"*"))
				myrule=1;
			else
				myrule=0;
		}else
		/* strlen(s)>9 keeps s+9 inside the current line */
		if((!(STRNCASECMP(s,"Disallow")))&&(myrule)&&(strlen(s)>9)){
			if((e=strchr(s+9,'#')))*e=0;
			e=s+9;SKIP(e," \t");s=e;
			SKIPN(e," \t");*e=0;
			if(*s){
				pathlen=strlen(s);
				e_path=(char*)malloc(pathlen*2+1);
				if(e_path){
					mysql_escape_string(e_path,s,pathlen);
					n=snprintf(qbuf,sizeof(qbuf),"INSERT INTO robots (hostinfo,path) VALUES ('%s','%s')",e_host,e_path);
					free(e_path);
					if(n>=0&&(size_t)n<sizeof(qbuf)){
						if(safe_mysql_query(&mysql,qbuf))abort_indexer();
					}
				}
			}
		}
		s=strtok(NULL,"\n\r");
	}
	free(e_host);
	return(0);
}


/*
 * Look up the original of a document by checksum.
 * Returns the smallest rec_id among the url rows that carry `crc` and
 * have indexed>0, or 0 when crc is NULL or empty, or when no such row
 * exists. A failed query goes through abort_indexer().
 */
int FindOrigin(char *crc){
MYSQL_RES *res;
MYSQL_ROW row;
int origin=0;

	if(crc==NULL||*crc=='\0')
		return(0);

	sprintf(qbuf,"SELECT min(rec_id) FROM url WHERE crc='%s' AND indexed>0",crc);
	if(safe_mysql_query(&mysql,qbuf))
		abort_indexer();
	res=mysql_store_result(&mysql);
	if(res!=NULL){
		row=mysql_fetch_row(res);
		if(row!=NULL&&row[0]!=NULL)
			origin=atoi(row[0]);
		mysql_free_result(res);
	}
	return(origin);
}


main(int argc, char **argv) {
extern char *optarg;
extern int optind;
char tagstr[STRSIZ]="",urlstr[STRSIZ]="",statusstr[STRSIZ]="";
char urlin[STRSIZ]="",str[STRSIZ]="",str1[STRSIZ]="";
char text[2*MAXTEXTSIZE]="";      char text_escaped[2*MAXTEXTSIZE+1]="";
char keywords[MAXKEYWORDSIZE]=""; char keywords_escaped[2*MAXKEYWORDSIZE+1]="";
char descript[MAXDESCSIZE]="";    char descript_escaped[2*MAXDESCSIZE+1]="";
char last_modified[32]="";        char last_modified_escaped[32*2+1]="";
char title[MAXTITLESIZE]="";      char title_escaped[MAXTITLESIZE*2+1];
char get_url_buf[STRSIZ]="",user_agent[256]="";
char SEPARATOR[256]="";
char digest[33];
MYSQL_RES *urlres;
MYSQL_RES *res;
MYSQL_ROW row;
FILE  *config;
URL from,newurl;
TAG tag;
char *buf,*s,*e,*tmp,sch;
char *cururl,*last_index_time;
char *content,*header,*content_type,*location,*href,*statusline,*crc;
char *mime;
int  mimeno;
char *charset=NULL,*proxy=NULL,*basic_auth=NULL;
char *language=NULL,*affix=NULL,*dictionary=NULL;
char *config_name= CONFIGDIR "/indexer.conf";


int inbody,intitle,inscript,follow,index,changed,ch,i,j,state,len,currow,nrows;
int gindex=1,gfollow=1,deletebad=1,userobots=1,npages=-1,npages0=0,efirst=0;
int bodyweight=1,titleweight=2,keywordweight=2,descweight=2,urlweight=0;
int reindexall=0,done=0,clear=0,stat=0,integrity=0,reload_robots=1;
int myrule=0,gmt_offset=0,outside=0,origin=0,clone=0;
int max_net_errors=MAXNETERRORS,read_timeout=READ_TIMEOUT;
int url_id,size,oldsize,oldstatus,status,hops,hint,realsize;
int period=DEFAULT_REINDEX_TIME,maxhops=DEFAULT_MAX_HOPS;
int Method=0,add_servers=1,dump=0,seconds=0,maxdocsize=MAXDOCSIZE;
struct tm *tim;
time_t clock;
int log2stderr=1;
int reindex=0;

	if(e=getenv("LANG"))setlocale(LC_ALL,e);

#ifdef HAVE_TM_GMTOFF
	clock=time(0);tim=localtime(&clock);
	gmt_offset=tim->tm_gmtoff;
#else
	gmt_offset=get_gmtoff();
#endif

	s=SEPARATOR;*s=0;
	for(i=1;i<255;i++)
		if(!strchr(WORDCHAR,i)){
			*s=i;s++;*s=0;
		}

	/* Parse arguments */
	while ((ch = getopt(argc, argv, "CSIahelmdq?t:u:s:n:v:L:A:D:p:")) != -1){
		switch (ch) {

		case 'C': clear++;add_servers=0;break;	
		case 'S': stat++;add_servers=0;break;
		case 'I': integrity++;add_servers=0;break;
		case 'L': language=optarg;break;
		case 'A': affix=optarg;add_servers=0;break;
		case 'D': dictionary=optarg;add_servers=0;break;
		case 'q': add_servers=0;break;
		case 'l': log2stderr=0;break;
		case 'a': reindexall=1;break;
		case 'e': efirst=1;break;
		case 'm': reindex=1;break;
		case 'n': npages=atoi(optarg);npages0=npages;break;
		case 'v': loglevel=atoi(optarg);break;
		case 'p': seconds=atoi(optarg);break;
		case 'd': dump=1;break;
		case 't':
			sprintf(str,"%d",atoi(optarg));
			if(*tagstr){
				strcat(tagstr,",");
			}else{
				strcat(tagstr," AND url.tag IN (");
			}
			strcat(tagstr,str);
			break;
		case 's':
			sprintf(str,"%d",atoi(optarg));
			if(*statusstr){
				strcat(statusstr,",");
			}else{
				strcat(statusstr," AND url.status IN (");
			}
			strcat(statusstr,str);
			break;
		case 'u':
			if(*urlstr){
				strcat(urlstr," OR ");
			}else{
				strcat(urlstr," AND (");
			}
			strcat(urlstr,"url.url LIKE '");
			strcat(urlstr,optarg);
			strcat(urlstr,"'");
			break;
		
		case '?':
		case 'h':
		default:
			usage();
			exit(1);
		}
	}
	argc -= optind;
	argv += optind;
	if(argc>1){
		usage();
		exit(1);
	}
	if (argc==1)config_name=argv[0];

	if(*tagstr)strcat(tagstr,")");
	if(*statusstr)strcat(statusstr,")");
	if(*urlstr)strcat(urlstr,")");

	/* Load config file */
	if(!(config=fopen(config_name,"r"))){
		fprintf(stderr, "Error - can't open config file %s: error%s\r\n",
			config_name, strerror(errno));
		exit(1);
	}

	i=0;
	while(fgets(str,sizeof(str),config)){
		i++; strcpy(str1,"");
		if(!strcmp(str,"\n"))
			continue;
		if(str[0]=='#')
			continue;
		if(!STRNCASECMP(str,"MySQLHost")){
			if(sscanf(str+9,"%s",str1))
				MySQLHost=strdup(str1);
		}else
		if(!STRNCASECMP(str,"MySQLDB")){
			if(sscanf(str+7,"%s",str1)) 
				MySQLDB=strdup(str1);
		}else
		if(!STRNCASECMP(str,"MySQLUser")){
			if(sscanf(str+9,"%s",str1)&&strcmp(str1,""))
				MySQLUser=strdup(str1);
		}else
		if(!STRNCASECMP(str,"LocalCharset")){
			if(sscanf(str+13,"%s",str1)&&strcmp(str1,"")){
				free_it(localcharset);
				localcharset=strdup(str1);
			}
		}else
		if(!STRNCASECMP(str,"MySQLPass")){
			if(sscanf(str+9,"%s",str1)&&strcmp(str1,""))
				MySQLPass=strdup(str1);
			else
				MySQLPass=strdup("");
		}else
		if(!STRNCASECMP(str,"disallow")){
			s=strtok(str," \t\r\n");
			while(s=strtok(NULL," \t\r\n")){
				if(AddFilter(s,F_DISALLOW))
					exit(1);
			}
		}else
		if(!STRNCASECMP(str,"allow")){
			s=strtok(str," \t\r\n");
			while(s=strtok(NULL," \t\r\n")){
				if(AddFilter(s,F_ALLOW))
					exit(1);
			}
		}else
		if(!STRNCASECMP(str,"CheckOnly")){
			s=strtok(str," \t\r\n");
			while(s=strtok(NULL," \t\r\n")){
				if(AddFilter(s,F_HEAD))
					exit(1);
			}
		}else
		if(!STRNCASECMP(str,"Tag")){
			if(sscanf(str+4,"%d",&hint)!=1)
				hint=0;
		}else
		if(!STRNCASECMP(str,"MaxNetErrors")){
			if(sscanf(str+12,"%d",&max_net_errors)!=1)
				max_net_errors=MAXNETERRORS;
		}else
		if(!STRNCASECMP(str,"ReadTimeOut")){
			if(sscanf(str+11,"%d",&read_timeout)!=1)
				read_timeout=READ_TIMEOUT;
		}else
		if(!STRNCASECMP(str,"Period")){
			if(sscanf(str+6,"%d",&period)!=1)
				period=DEFAULT_REINDEX_TIME;
		}else
		if(!STRNCASECMP(str,"CharSet")){
			free_it(charset);
			if(sscanf(str+7,"%s",str1))
				charset=strdup(str1);
		}else
		if(!STRNCASECMP(str,"Proxy")){
			free_it(proxy);
			if(sscanf(str+5,"%s",str1)&&strcmp(str1,""))
				proxy=strdup(str1);
		}else
#ifdef HAVE_SYSLOG_H 
		if(!STRNCASECMP(str,"SyslogFacility")){
			if(sscanf(str+14,"%s",str1)&&strcmp(str1,""))
				log_facility=syslog_facility(str1);
		}else
#endif
		if(!STRNCASECMP(str,"AuthBasic")){
			free_it(basic_auth);
			if(sscanf(str+9,"%s",str1)){
				basic_auth=(char*)malloc(BASE64_LEN(strlen(str1)));
				base64_encode(str1,basic_auth,strlen(str1));
			}
		}else
		if(!STRNCASECMP(str,"Mime")){
			AddMime(str+4);
		}else
		if(!STRNCASECMP(str,"Server")){
			if(sscanf(str+6,"%s",str1)==1){
				if(!parseurl(&from,str1)){
					AddServer(str1,charset,proxy,period,
					hint,basic_auth,outside,maxhops,
					gindex,deletebad,userobots,gfollow,
					bodyweight,titleweight,urlweight,
					descweight,keywordweight,
					max_net_errors,read_timeout);
					if((!strcmp(from.schema,"http"))
							&&add_servers){
						sprintf(str,"%s://%s/%s",
							from.schema,
							from.hostinfo,
							"robots.txt");
						AddURL(str,0,0);
					}
					if(add_servers)
						AddURL(str1,0,0);
				}
			}else{
				fprintf(stderr, "Error in config file line %d:%s",i,str);
				exit(1);
			}
		}else
		if(!STRNCASECMP(str,"FollowOutside")){
			if(sscanf(str+13,"%s",&str1)==1){
				if(!STRNCASECMP(str1,"yes"))
					outside=1;
				else
					outside=0;
			}else
				outside=0;
		}else
		if(!STRNCASECMP(str,"Index")){
			if(sscanf(str+5,"%s",&str1)==1){
				if(!STRNCASECMP(str1,"yes"))
					gindex=1;
				else
					gindex=0;
			}else
				gindex=1;
		}else
		if(!STRNCASECMP(str,"Follow ")){
			if(sscanf(str+7,"%s",&str1)==1){
				if(!STRNCASECMP(str1,"yes"))
					gfollow=1;
				else
					gfollow=0;
			}else
				gfollow=1;
		}else
		if(!STRNCASECMP(str,"Robots")){
			if(sscanf(str+6,"%s",&str1)==1){
				if(!STRNCASECMP(str1,"yes"))
					userobots=1;
				else
					userobots=0;
			}else
				userobots=1;
		}else
		if(!STRNCASECMP(str,"DeleteBad")){
			if(sscanf(str+9,"%s",&str1)==1){
				if(!STRNCASECMP(str1,"yes"))
					deletebad=1;
				else
					deletebad=0;
			}else
				deletebad=1;
		}else
		if(!STRNCASECMP(str,"MaxHops")){
			if(sscanf(str+7,"%d",&maxhops)!=1)
				maxhops=0;
		}else
		if(!STRNCASECMP(str,"BodyWeight")){
			if(sscanf(str+10,"%d",&bodyweight)!=1)
				bodyweight=1;
		}else
		if(!STRNCASECMP(str,"TitleWeight")){
			if(sscanf(str+11,"%d",&titleweight)!=1)
				titleweight=2;
		}else
		if(!STRNCASECMP(str,"DescWeight")){
			if(sscanf(str+10,"%d",&descweight)!=1)
				descweight=2;
		}else
		if(!STRNCASECMP(str,"UrlWeight")){
			if(sscanf(str+9,"%d",&urlweight)!=1)
				urlweight=0;
		}else
		if(!STRNCASECMP(str,"KeywordWeight")){
			if(sscanf(str+13,"%d",&keywordweight)!=1)
				keywordweight=2;
		}else
		if(!STRNCASECMP(str,"MaxDocSize")){
			if(sscanf(str+11,"%d",&maxdocsize)!=1)
				maxdocsize=MAXDOCSIZE;
		}else
		if(!STRNCASECMP(str,"NoIndex")){
			gindex=0;
		}else
		if(!STRNCASECMP(str,"NoFollow")){
			gfollow=0;
		}else{
			fprintf(stderr,"Error in config file line %d:%s",i,str);
			exit(1);
		}
	}
	fclose(config);

	/* Open log */
	udm_openlog(log_facility,log2stderr);

	if(affix||dictionary){
		if(!language){
			fprintf(stderr,"Language is not specified for import!\n");
			exit(1);
		}
		if(strlen(language)!=2){
			fprintf(stderr,"Language should be 2 letters\n");
			exit(1);
		}
		if(affix)ImportAffixes(language,affix,dump);
		if(dictionary)ImportDictionary(language,dictionary,dump);
		exit(0);
	}
	if(clear){
		ClearDatabase(tagstr,urlstr,statusstr);
		exit(0);
	}
	if(stat){
		ShowStatistics(tagstr,urlstr,statusstr);
		exit(0);
	}
	if(integrity){
		ShowReferers(tagstr,urlstr,statusstr);
		exit(0);
	}

	udm_log(INFO, "indexer from UdmSearch v." VERSION" started with config %s", config_name);

	/* Add one virtual server if we want to follow outside */
	if(outside)
		AddServer("",charset,proxy,period,hint,
			basic_auth,outside,maxhops,gindex,
			deletebad,userobots,gfollow,
			bodyweight,titleweight,urlweight,
			descweight,keywordweight,
			max_net_errors,read_timeout);

	free_it(charset);free_it(proxy);free_it(basic_auth);

	/*  Long name should be found first    */
	/*  to allow different options         */
	/*  for server and it's subdirectories */
	qsort((void*)Server,nservers,sizeof(SERVER),cmpserver);

	LoadStopList();

	if(reindexall){ /* option -a given */
		sprintf(qbuf,"UPDATE url SET next_index_time=NOW() WHERE 1=1 %s%s%s",tagstr,urlstr,statusstr);
		if(safe_mysql_query(&mysql,qbuf))abort_indexer();
	}

	/* Some useful constants */
	sprintf(get_url_buf,
"SELECT url,rec_id,size,status,concat(date_format(from_unixtime(unix_timestamp(last_index_time)+(%d)),'%%a, %%e %%b %%Y %%T'),' GMT'),hops,crc FROM url WHERE next_index_time<=NOW() %s %s %s %s LIMIT %d",
	gmt_offset,tagstr,urlstr,statusstr,efirst?" ORDER BY next_index_time ":"",URL_SELECT_CACHE);
	sprintf(user_agent,"%s/%s",INDEXER_USER_AGENT,VERSION);

	/* No url retrieved yet */
	currow=1;nrows=0;urlres=NULL;

	/* Alloc buffer for document */
	buf=(char*)malloc(maxdocsize);

/***************************************************************/
/****************** Now start indexing ... *********************/
/***************************************************************/

while(1){
	if(!npages){
		udm_log(INFO,"%d documents done",npages0);
		exit(0);
	}
	if(npages>0)npages--;

	if(reload_robots){
		LoadRobots();
		reload_robots=0;
	}
	if(currow>=nrows){  /* Get next URL_CACHE records */
		udm_log(DEBUG,"Locking next %d URLs",URL_SELECT_CACHE);
		if(urlres)mysql_free_result(urlres);
		if(safe_mysql_query(&mysql,"LOCK TABLES url WRITE"))abort_indexer();
		if(safe_mysql_query(&mysql,get_url_buf))abort_indexer();
		if(!(urlres=mysql_store_result(&mysql)))
			done=1;
		else
		if(!(mysql_num_rows(urlres))){
			mysql_free_result(urlres);
			done=1;
		}
		if(done){
			if(safe_mysql_query(&mysql,"UNLOCK TABLES"))abort_indexer();
			udm_log(INFO,"Done");
			mysql_close(&mysql);
			return(0);
		}
		urlin[0]=0;
		while(row=mysql_fetch_row(urlres)){
			if(urlin[0])strcat(urlin,",");
			strcat(urlin,row[1]);
		}
		sprintf(qbuf,"UPDATE url SET next_index_time=from_unixtime(unix_timestamp()+3600) WHERE rec_id in (%s)",urlin);
		if(safe_mysql_query(&mysql,qbuf))abort_indexer();
		if(safe_mysql_query(&mysql,"UNLOCK TABLES"))abort_indexer();
		currow=0;
		nrows=mysql_num_rows(urlres);
		mysql_data_seek(urlres,0);
	}

	status=1;changed=1;
	inbody=intitle=clone=0;
	content_type=location=href=charset=statusline=NULL;
	cururl=0;last_index_time=0;digest[0]=0;
	text[0]=0;title[0]=0;keywords[0]=0;descript[0]=0;

	if(row=mysql_fetch_row(urlres)){
		cururl=row[0];
		url_id=atoi(row[1]);
		oldsize=atoi(row[2]);
		oldstatus=atoi(row[3]);
		last_index_time=row[4];
		hops=atoi(row[5]);
		crc=row[6]?row[6]:"___";
		done=0;
	}else{
		udm_log(ERROR,"Fatal error");exit(1);
	}
	currow++;

	udm_log(INFO,"%s ",cururl);

#ifdef HAVE_SETPROCTITLE
	setproctitle("%s",cururl);
#endif
	if(parseurl(&from,cururl)){
		DeleteUrl(url_id);
		udm_log(WARNING,"Invalid http or ftp url: %s ... deleted",cururl);
		freeurl(&from);
		continue;
	}
	if((i=FindServer(cururl))<0){
		DeleteUrl(url_id);
		if(!strcmp(from.filename,"robots.txt")){
			DeleteRobotsFromHost(from.hostinfo);
			reload_robots=1;
		}
		udm_log(WARNING,"No 'Server' command for url... deleted.");
		freeurl(&from);
		continue;
	}
	period=Server[i].period;
	hint=Server[i].hint;
	outside=Server[i].outside;
	maxhops=Server[i].maxhops;
	gindex=Server[i].gindex;index=gindex;
	gfollow=Server[i].gfollow;follow=gfollow;
	userobots=Server[i].userobots;
	deletebad=Server[i].deletebad;
	bodyweight=Server[i].bodyweight;
	titleweight=Server[i].titleweight;
	urlweight=Server[i].urlweight;
	descweight=Server[i].descweight;
	keywordweight=Server[i].keywordweight;
	read_timeout=Server[i].read_timeout;

	charset=Server[i].charset;

	if(hops>maxhops){
		DeleteUrl(url_id);
		udm_log(WARNING,"Too many hops (%d)... deleted.",hops);
		freeurl(&from);
		continue;
	}
	Method=WhatToDo(cururl);
	
	if(Method==F_DISALLOW){
		DeleteUrl(url_id);
		if(!strcmp(from.filename,"robots.txt")){
			DeleteRobotsFromHost(from.hostinfo);
			reload_robots=1;
		}
		udm_log(WARNING,"URL deleted due to 'Disallow' command");
		freeurl(&from);
		continue;
	}
	if((userobots)&&(FindRobots(cururl)>=0)){
		/* If robots.txt is disallowed by itself, */
		/* we will not delete it. */
		if(strcmp(from.filename,"robots.txt")){
			DeleteUrl(url_id);
			udm_log(WARNING,"URL deleted due to robots.txt exclusion");
		}
		freeurl(&from);
		continue;
	}
	if(strcmp(from.schema,"http")&&strcmp(from.schema,"ftp")){
		DeleteUrl(url_id);
		udm_log(WARNING,"Unknown URL schema: %s",cururl);
		freeurl(&from);
		continue;
	}
	if((!strcmp(from.schema,"ftp"))&&(!Server[i].proxy)){
		udm_log(WARNING,"FTP is available via proxy only, skipped");
		UpdateUrl(url_id,504,period);
		freeurl(&from);
		continue;
	}
	if(Server[i].net_errors>Server[i].max_net_errors){
		udm_log(WARNING,"Too many network errors for this server, skipped");
		UpdateUrl(url_id,504,period);
		freeurl(&from);
		continue;
	}
	/* Compose HTTP header */
	sprintf(str,"%s %s%s HTTP/1.0\r\nHost: %s\r\n",
		(Method==F_HEAD)?"HEAD":"GET",
		Server[i].proxy?cururl:from.path,
		Server[i].proxy?"":from.filename,
		from.hostname);
		
	/* Auth if required */
	if(Server[i].basic_auth){
		strcat(str,"Authorization: Basic ");
		strcat(str,Server[i].basic_auth);
		strcat(str,"\r\n");
	}
	
	/* User agent */
	strcat(str,"User-Agent: ");strcat(str,user_agent);strcat(str,"\r\n");

	/* If the document is already indexed, maybe it has not been modified */
	if((oldstatus)&&(last_index_time[0])&&(!reindex)){
		strcat(str,"If-Modified-Since: ");
		strcat(str,last_index_time);
		strcat(str,"\r\n");
	}

	/* We prefer some charset ? */
	if(localcharset){
		strcat(str,"Accept-charset: ");
		strcat(str,localcharset);
		strcat(str,"\r\n");
	}

	/* Empty line is the end of HTTP header */
	strcat(str,"\r\n");

	size = HTTPGet(
		Server[i].proxy?Server[i].proxy:from.hostname,
		Server[i].proxy?Server[i].proxy_port:(from.port?from.port:80),
		str,buf,maxdocsize,read_timeout
	);

	if(size==-3){
		udm_log(WARNING,"Download timeout");
		UpdateUrl(url_id,504,period);
		Server[i].net_errors++;
		freeurl(&from);
		continue;
	}
	if(size<=0){
		udm_log(WARNING,"Could not connect to host %s:%d",
			Server[i].proxy?Server[i].proxy:from.hostname,
			Server[i].proxy?Server[i].proxy_port:(from.port?from.port:80));
		UpdateUrl(url_id,503,period);
		Server[i].net_errors++;
		freeurl(&from);
		continue;
	}

	/* Parse response header */
	buf[size]=0;header=buf;
	if(!(content=strstr(buf,"\r\n\r\n")))content=strstr(buf,"\n\n");
	if(!content){
		UpdateUrl(url_id,503,period);
		freeurl(&from);
		udm_log(ERROR,"URL has no content, skipped");
		Server[i].net_errors++;
		continue;
	}

	*content=0; content+=4; size-=(content-buf);
	realsize=size;

	s=strtok(header,"\r\n");
	while(s){
		
		if(!STRNCASECMP(s,"HTTP/")){
			status=atoi(s+8);
			statusline=s;
			udm_log(DEBUG,"%s",s);
		}else
		if(!STRNCASECMP(s,"Content-Type: ")){
			udm_log(DEBUG,"%s",s);
			content_type=s+14;
			if(e=strstr(content_type,"charset="))
				charset=e+8;
		}else
		if(!STRNCASECMP(s,"Location: ")){
			location=s+10;
			udm_log(DEBUG,"%s",s);
		}else
		if(!STRNCASECMP(s,"Content-Length: ")){
			udm_log(DEBUG,"%s",s);
			size=atoi(s+16);
		}else
		if(!STRNCASECMP(s,"Last-Modified: ")){
			udm_log(DEBUG,"%s",s);
			strncpy(last_modified,s+15,sizeof(last_modified));
			last_modified[sizeof(last_modified)-1]=0;
			mysql_escape_string(last_modified_escaped,last_modified,strlen(last_modified));
		}else{
			udm_log(DEBUG,"%s",s);
		}
		s=strtok(NULL,"\r\n");
	}
	udm_log(EXTRA,"%s %s %d",
		statusline?statusline:"?",
		content_type?content_type:"?",
		size);

	/* Now parse the HTTP status code */
	switch(status){
	case 1: /* No HTTP code */
		UpdateUrl(url_id,status,period);
		udm_log(ERROR,"No http code!");
		Server[i].net_errors++;
		freeurl(&from);
		continue;

	case 200: /* OK */
		if(!content_type){
			UpdateUrl(url_id,status,period);
			udm_log(ERROR,"No Content-type in '%s'!",cururl);
			Server[i].net_errors++;
			freeurl(&from);
			continue;
		}
		break;

	case 301:  /* Moved Permanently */
	case 302:  /* Moved Temporarily */
	case 303:  /* See Other */
		/* We'll try to use Location: xxx instead */
		DeleteWordFromURL(url_id);
		if(hops<maxhops)
		if((location)&&(WhatToDo(location)!=F_DISALLOW)){
			if(!parseurl(&newurl,location)){
				if(FindServer(location)>=0)
					AddURL(location,url_id,hops+1);
				freeurl(&newurl);
			}else{
				udm_log(ERROR,"Error in redirect URL: '%s'",location);
			}
		}
		UpdateUrl(url_id,status,period);
		freeurl(&from);
		continue;

	case 304:  /* Not Modified, it's very good */
		UpdateUrl(url_id,status,period);
		freeurl(&from);
		continue;

	case 300:  /* Multiple Choices              */
	case 305:  /* Use Proxy (proxy redirect)    */
	case 400:  /* Bad Request ??? We tried ...  */
	case 401:  /* Unauthorized                  */
	case 402:  /* Payment Required              */
	case 403:  /* Forbidden                     */
	case 404:  /* Not found                     */
	case 405:  /* Method Not Allowed            */
	case 406:  /* Not Acceptable                */
	case 407:  /* Proxy Authentication Required */
	case 408:  /* Request Timeout               */
	case 409:  /* Conflict                      */
	case 410:  /* Gone                          */
	case 411:  /* Length Required               */
	case 412:  /* Precondition Failed           */
	case 413:  /* Request Entity Too Large      */
	case 414:  /* Request-URI Too Long          */
	case 415:  /* Unsupported Media Type        */
	case 500:  /* Internal Server Error         */
	case 501:  /* Not Implemented               */
	case 502:  /* Bad Gateway                   */
	case 505:  /* HTTP Version Not Supported    */
		/* delete it if not robots.txt */
		if(deletebad){
			if(!strcmp(from.filename,"robots.txt")){
				UpdateUrl(url_id,status,period);
				DeleteRobotsFromHost(from.hostinfo);
				reload_robots=1;
			}else{
				udm_log(EXTRA,"URL deleted");
				DeleteUrl(url_id);
			}
		}else{
			UpdateUrl(url_id,status,period);
		}
		freeurl(&from);
		continue;

	case 503: /* Service Unavailable */
	case 504: /* Gateway Timeout */
		/* We'll retry later, maybe host is down */
		udm_log(EXTRA,"Could not read URL, skipped");
		UpdateUrl(url_id,status,period);
		Server[i].net_errors++;
		freeurl(&from);
		continue;

	default:
		/* retry later */
		udm_log(WARNING,"HTTP %d We don't yet know how to handle it, skipped",status);
		UpdateUrl(url_id,status,period);
		freeurl(&from);
		continue;
	}



	/* Now we have HTTP 200 and know content type */

	/* Let's try to start external parser,
	   if any found for this content type */
	if(mime=mime_parse(content_type,&mimeno,content,realsize,maxdocsize-(content-buf))){
		content=mime;
		content_type=mime_name(mimeno,MIME_TO);
		if(mime=mime_charset(mimeno)) charset=mime;
	};

	/* robots.txt */
	if(!STRNCASECMP(content_type,"text/plain")&&
	(!strcmp(from.filename,"robots.txt"))){
		ParseRobots(content,from.hostinfo);
		reload_robots=1;
	}else

	/* plain TEXT or the same */
	if(!STRNCASECMP(content_type,"text/plain")||
	!STRNCASECMP(content_type,"text/tab-separated-values")||
	!STRNCASECMP(content_type,"text/css"))
	{
		recode_string(content,charset,localcharset);
		MD5_DIGEST(content,realsize,digest);
		changed=strcmp(digest,crc);
		if((origin=FindOrigin(digest))&&(origin!=url_id))clone=1;

		if(((index)&&(!clone)&&(changed))||reindex){
			strncpy(text,content,sizeof(text));
			text[sizeof(text)-1]=0;
			s=strtok(content,SEPARATOR);
			while(s){
				if(bodyweight)AddWord(s,bodyweight,1);
				s=strtok(NULL,SEPARATOR);
			}
		}
	}else

	/* HTML text */
	if(!STRNCASECMP(content_type,"text/html")){

		recode_string(content,charset,localcharset);
		MD5_DIGEST(content,realsize,digest);
		changed=strcmp(digest,crc);
		if((origin=FindOrigin(digest))&&(origin!=url_id))clone=1;

		state=HTML_UNK; s=content;
		while((*s)&&(changed||reindex)){
		switch(state){
		case HTML_UNK: /* yet unknown */
			if(!strncmp(s,"<!--",4)){
				state=HTML_COM;
				break;
			}
			if(*s=='<'){
				state=HTML_TAG;
				break;
			}
			state=HTML_TXT;
			break;

		case HTML_COM: /* comment */
			e=strstr(s,"-->");
			if(e)len=e-s+3;
			else len=strlen(s);
			s=s+len; state=HTML_UNK;
			break;

		case HTML_TAG: /* tag */
			e=strchr(s,'>');
			if(e)len=e-s+1;
			else len=strlen(s);
			tmp=(char*)malloc(len+1);
			strncpy(tmp,s,len);tmp[len]=0;
			parsetag(&tag,tmp);

			if(!strcasecmp(tag.name,"body"))inbody=1;
			else
			if(!strcasecmp(tag.name,"/body"))inbody=0;
			else
			if(!strcasecmp(tag.name,"script"))inscript=1;
			else
			if(!strcasecmp(tag.name,"/script"))inscript=0;
			else
			if(!strcasecmp(tag.name,"title"))intitle=1;
			else
			if(!strcasecmp(tag.name,"/title"))intitle=0;
			else
			if((!strcasecmp(tag.name,"a"))&&(tag.href))
				href=strdup(tag.href);
			else
			if((!strcasecmp(tag.name,"area"))&&(tag.href))
				href=strdup(tag.href);
			else
			if((!strcasecmp(tag.name,"frame"))&&(tag.src))
				href=strdup(tag.src);
			else
			if((!strcasecmp(tag.name,"meta"))&&(tag.equiv)&&(tag.content)){
				if(!strcasecmp(tag.equiv,"keywords")){
					strncpy(keywords,tag.content,sizeof(keywords));
					keywords[sizeof(keywords)-1]=0;
					e=strtok(tag.content,SEPARATOR);
					while(e){
						if(index&&keywordweight)AddWord(e,keywordweight,1);
						e=strtok(NULL,SEPARATOR);
					}
				}else
				if(!strcasecmp(tag.equiv,"description")){
					strncpy(descript,tag.content,sizeof(descript));
					descript[sizeof(descript)-1]=0;
					e=strtok(tag.content,SEPARATOR);
					while(e){
						if(index&&descweight)AddWord(e,descweight,1);
						e=strtok(NULL,SEPARATOR);
					}
				}else
				if(!strcasecmp(tag.equiv,"robots")){
					if(userobots){
					e=strtok(tag.content,SEPARATOR);
					while(e){
						if(!strcasecmp(e,"ALL")){
							follow=1;index=1;
						}else
						if(!strcasecmp(e,"NONE")){
							follow=0;index=0;
						}else
						if(!strcasecmp(e,"NOINDEX"))
							index=0;
						else
						if(!strcasecmp(e,"NOFOLLOW"))
							follow=0;
						else
						if(!strcasecmp(e,"INDEX"))
							index=1;
						else
						if(!strcasecmp(e,"FOLLOW"))
							follow=1;
						e=strtok(NULL,SEPARATOR);
					}}
					
				}
			}
			if((href)&&(hops<maxhops)&&(follow)&&(gfollow)){
				if(!parseurl(&newurl,href)){
					sprintf(str,"%s://%s%s%s",
					strcmp(newurl.schema,"")?newurl.schema:from.schema,
					strcmp(newurl.hostinfo,"")?newurl.hostinfo:from.hostinfo,
					strcmp(newurl.path,"")?newurl.path:from.path,
					newurl.filename);
					remove2dot(str);
					if(!STRNCMP(str,"ftp://")&&(strstr(str,";type=")))
						*(strstr(str,";type"))=0;

					if((WhatToDo(str)!=F_DISALLOW)&&((FindRobots(str)<0)||(!userobots))){
						if(FindServer(str)>=0)
							AddURL(str,url_id,hops+1);
					}else{
					}
					freeurl(&newurl);
				}else{
				}
				free_it(href);
			}
			
			freetag(&tag);free(tmp);
			s=s+len; state=HTML_UNK;
			break;

		case HTML_TXT: /* text */
			e=strchr(s,'<');
			if(e) len=e-s;
			else  len=strlen(s);
			tmp=(char*)malloc(2*len+6);
			e=tmp;j=0;*e=0;
			for(i=0;i<len;i++){
				switch(s[i]){
				case ' ' : case '\t':
				case '\n': case '\r':
					j++; break;

				case '&': /* parse specials */
					if(j){*e=' ';e++;j=0;}
					i++;
					if(sch=SgmlToChar(s+i)){
						if(sch==' ')j++;
						*e=sch;e++;*e=0;
						while((i<len)&&(isalnum(*(s+i+1))))
							i++;
						if(s[i+1]==';')i++;
					}else{
						*e='?';e++;*e=0;
					}
					break;
				default:
					if(j){*e=' ';e++;j=0;}
					*e=(s[i]=='\'')?'`':s[i];
					e++; *e=0;
					break;
				}
			}

			if((inbody&&!inscript)&&(strlen(text)<MAXTEXTSIZE)){
				if(*text)strcat(text," ");
				strncat(text,tmp,MAXTEXTSIZE);
			}

			if((intitle)&&(strlen(title)<MAXTITLESIZE)){
				if(*title)strcat(title," ");
				strncat(title,tmp,MAXTITLESIZE);
				title[sizeof(title)-1]=0;
			}

			if(index){
				e=strtok(tmp,SEPARATOR);
				while(e){
					if(inbody&&bodyweight&&!inscript)
						AddWord(e,bodyweight,1);
					if(intitle&&titleweight)
						AddWord(e,titleweight,1);
					e=strtok(NULL,SEPARATOR);
				}
			}
			free(tmp);
			s=s+len; state=HTML_UNK;
			break;

		}} /* switch and while */
	}else{
		index=0;
		MD5_DIGEST(content,realsize,digest);
		changed=strcmp(digest,crc);
		if((origin=FindOrigin(digest))&&(origin!=url_id))clone=1;
	}

	if((urlweight)&&(strcmp(from.filename,"robots.txt"))){
		s=strtok(cururl,SEPARATOR);
		while(s){
			AddWord(s,urlweight,1);
			s=strtok(NULL,SEPARATOR);
		}
	}

	if(clone){
		DeleteWordFromURL(url_id);
		udm_log(EXTRA,"Duplicate document");

		sprintf(qbuf,
"UPDATE url \
SET crc='%s',status=%d,\
next_index_time=from_unixtime(unix_timestamp()+%d), \
content_type='%s',last_modified='%s' \
WHERE rec_id=%d",
digest,status,period,content_type,last_modified_escaped,url_id);

		if(safe_mysql_query(&mysql,qbuf))abort_indexer();
		freeurl(&from);
		FreeWords();
		continue;

	}else{
		if((!changed)&&(!reindex)){
			UpdateUrl(url_id,status,period);
			freeurl(&from);
			FreeWords();
			continue;
		}
	}

	StoreWords(url_id);
	FreeWords();

	/* Prepare text,keywords,description */
	s=text;while(*s){if(strchr("\t\n\r",*s))*s=' ';s++;}
	s=keywords;while(*s){if(strchr("\t\n\r",*s))*s=' ';s++;}
	s=descript;while(*s){if(strchr("\t\n\r",*s))*s=' ';s++;}
	s=title;while(*s){if(strchr("\t\n\r",*s))*s=' ';s++;}
	mysql_escape_string(text_escaped,text,strlen(text));
	mysql_escape_string(keywords_escaped,keywords,strlen(keywords));
	mysql_escape_string(descript_escaped,descript,strlen(descript));
	mysql_escape_string(title_escaped,title,strlen(title));

	/* Now update URL */
	sprintf(qbuf,"\
UPDATE url SET \
status='%d',last_modified='%s',last_index_time=%s,\
next_index_time=from_unixtime(unix_timestamp()+%d),\
tag=%d,text='%s',title='%s',content_type='%s',size=%d,\
indexed=%d,keywords='%s',description='%s',crc='%s' \
WHERE rec_id=%d",

	status,last_modified_escaped,changed?"NOW()":"last_index_time",
	period,hint,text_escaped,title,content_type,size,index,
	keywords_escaped,descript_escaped,digest,url_id);

	if(safe_mysql_query(&mysql,qbuf))abort_indexer();
	freeurl(&from);

	if(seconds){
		udm_log(DEBUG,"Sleeping %d second(s)",seconds);
		sleep(seconds);
	}

} /* endwhile */
} /* end of program */
