/* widechar.c - handle multibyte and UTF-8 encoding
   Copyright (C) 1996-2000 Paul Sheer

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.
 */

#include <config.h>
#include <edit.h>

/* While get_mb_rule() scans forward it records a marker (a saved copy of the
   decoder rule) once the scan has moved more than this many bytes past the
   most recent marker, so later backward lookups need not rescan from the
   start of the buffer. */
#define MB_MARKER_DENSITY 64

/*
   UTF-8 sequence layouts (v = payload bit):

   bytes | bits | encoding
     1 |    7 | 0vvvvvvv
     2 |   11 | 110vvvvv 10vvvvvv
     3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
     4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
     5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
     6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv

*/

/* When non-zero, get_mb_rule() always decodes the buffer with the built-in
   UTF-8 decoder instead of the locale's mbrtowc() - i.e. locale settings are
   ignored.  When zero, the locale's multibyte encoding is used (or plain
   bytes if the locale is single-byte). */
int option_utf_interpretation = 0;

/* Encode the UCS-4 code point C as a UTF-8 byte string.

   Returns a pointer to a static, NUL-terminated buffer which is overwritten
   by the next call, so the result must be used or copied before calling
   again.  Values below 2^21 are encoded in 1 to 4 bytes; anything at or above
   2^21 produces an empty string. */
unsigned char *wcrtomb_ucs4_to_utf8 (wchar_t c)
{
    static unsigned char r[32];
    int i = 0;
#undef APPEND
#define APPEND(x) r[i++] = (unsigned char) (x)
    if (c < (1 << 7)) {
	/* 0vvvvvvv */
	APPEND (c);
    } else if (c < (1 << 11)) {
	/* 110vvvvv 10vvvvvv */
	APPEND ((c >> 6) | 0xC0);
	APPEND ((c & 0x3F) | 0x80);
    } else if (c < (1 << 16)) {
	/* 1110vvvv 10vvvvvv 10vvvvvv */
	APPEND ((c >> 12) | 0xE0);
	APPEND (((c >> 6) & 0x3F) | 0x80);
	APPEND ((c & 0x3F) | 0x80);
    } else if (c < (1 << 21)) {
	/* 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv - the lead byte must carry the
	   0xF0 marker; 0xE0 would announce a 3-byte sequence and make the
	   output invalid UTF-8 */
	APPEND ((c >> 18) | 0xF0);
	APPEND (((c >> 12) & 0x3F) | 0x80);
	APPEND (((c >> 6) & 0x3F) | 0x80);
	APPEND ((c & 0x3F) | 0x80);
    }
    APPEND ('\0');
    return r;
}


/* Decode one UTF-8 sequence starting at T, of which N bytes are available,
   into *C.  Sequences of up to 6 bytes (31 payload bits) are understood.
   X is unused (UTF-8 has no shift state).

   Returns:
     0   the first byte is NUL (*C is set to 0);
     1-6 the number of bytes making up the decoded character;
     -1  the sequence is invalid: a stray continuation byte, a lead byte of
         0xFE/0xFF, a bad continuation byte, or an overlong encoding;
     -2  the lead byte asks for more than N bytes. */
static inline int mbrtowc_utf8_to_ucs4 (wchar_t * c, char *t, int n, void *x /* no shifting with utf8 */ )
{
    const unsigned char *bytes = (const unsigned char *) t;
    unsigned char lead = bytes[0];
    unsigned long value;
    long smallest;		/* smallest value that needs this many bytes */
    int len, k;

    if (lead == 0) {
	*c = 0;
	return 0;
    }
    if (lead < 0x80) {
	*c = (wchar_t) lead;
	return 1;
    }

    /* classify the lead byte: sequence length, payload bits, overlong limit */
    if (lead < 0xC0) {
	return -1;		/* continuation byte with no lead */
    } else if (lead < 0xE0) {
	len = 2;
	value = lead & 0x1F;
	smallest = 1L << 7;
    } else if (lead < 0xF0) {
	len = 3;
	value = lead & 0x0F;
	smallest = 1L << 11;
    } else if (lead < 0xF8) {
	len = 4;
	value = lead & 0x07;
	smallest = 1L << 16;
    } else if (lead < 0xFC) {
	len = 5;
	value = lead & 0x03;
	smallest = 1L << 21;
    } else if (lead < 0xFE) {
	len = 6;
	value = lead & 0x01;
	smallest = 1L << 26;
    } else {
	return -1;
    }

    if (n < len)
	return -2;

    /* every trailing byte must look like 10vvvvvv */
    for (k = 1; k < len; k++) {
	if ((bytes[k] & 0xC0) != 0x80)
	    return -1;
    }

    /* splice in six payload bits per trailing byte */
    for (k = 1; k < len; k++)
	value = (value << 6) | (unsigned long) (bytes[k] & 0x3F);

    *c = (wchar_t) value;
    if (*c < smallest)
	return -1;		/* overlong encoding */
    return len;
}

#if 0
/* Convert the first N bytes of the UTF-8 string T into wide characters
   stored at C, returning the number of wide characters written.

   The last argument is the length of T to convert, NOT the capacity of C;
   the output is not zero-terminated here.  A byte that does not start a
   valid sequence is copied through as a single character.  Conversion stops
   early when the remaining bytes end in the middle of a sequence.

   This code is currently compiled out (#if 0).

   NOTE(review): if a NUL byte lies within the first N bytes, the decoder
   returns 0, so neither T nor N advances while C and the count keep
   growing - the loop would not terminate.  Handle r == 0 before enabling. */
int mbstowcs_utf8_to_ucs4 (wchar_t * c, char *t, int n)
{
    int v = 0;
    while (n) {
	int r;
	if ((r = mbrtowc_utf8_to_ucs4 (c, t, n, 0)) == -1) {
	    /* invalid sequence: pass the raw byte through */
	    *c++ = *t++;
	    v++;
	    n--;
	} else if (r == -2) {
	    /* truncated sequence at the end of the input */
	    break;
	} else {
	    /* the decoder already stored the character in *c */
	    t += r;
	    n -= r;
	    v++;
	    c++;
	}
    }
    return v;
}

/* Return a newly allocated, zero-terminated wide-character copy of the
   UTF-8 string S.  The buffer holds one wide character per input byte plus
   the terminator, which is the most the conversion can produce since every
   stored character consumes at least one byte.  Memory comes from CMalloc
   (defined elsewhere); presumably the caller releases it - confirm against
   CMalloc's contract.  Currently compiled out (#if 0). */
wchar_t *mbstowcs_dup (unsigned char *s)
{
    wchar_t *t;
    t = CMalloc ((strlen ((char *) s) + 1) * sizeof (wchar_t));
    /* the conversion returns the number of characters written, which is
       exactly where the terminator belongs */
    t[mbstowcs_utf8_to_ucs4 (t, (char *) s, strlen ((char *) s))] = 0;
    return t;
}

/* Return the number of wide characters in P before the terminating zero.
   Currently compiled out (#if 0). */
int wchar_t_strlen (wchar_t * p)
{
    int v;
    /* empty loop body: all the work is in the loop header */
    for (v = 0; *p; p++, v++);
    return v;
}
#endif

/* Advance MB_RULE across the byte at BYTE_INDEX using the built-in UTF-8
   decoder, and return the updated rule.

   On return, `ch' is the decoded character when BYTE_INDEX starts a valid
   sequence (with `end' set to the number of trailing bytes still to come),
   -1 when BYTE_INDEX is a trailing byte of a sequence decoded earlier, or
   the raw byte with bit 31 set when the bytes here are not valid UTF-8. */
static inline struct mb_rule apply_mb_rules_going_right_utf8_to_ucs4 (WEdit * edit, long byte_index,
								      struct mb_rule mb_rule)
{
    unsigned char buf[16];
    wchar_t decoded;
    int have;			/* number of bytes fetched so far */

    /* still inside a sequence that was decoded at its first byte */
    if (mb_rule.end != 0) {
	mb_rule.end--;
	mb_rule.ch = -1;
	return mb_rule;
    }

    /* fetch one byte at a time until the decoder makes up its mind */
    for (have = 1; have <= 6; have++) {
	int status;

	buf[have - 1] = edit_get_byte (edit, byte_index + (have - 1));
	status = mbrtowc_utf8_to_ucs4 (&decoded, (char *) buf, have, &mb_rule.shift_state);

	if (status == -1) {
	    /* invalid: hand back the first byte, flagged in the top bit */
	    mb_rule.end = 0;
	    mb_rule.ch = (unsigned long) *buf | 0x80000000;
	    return mb_rule;
	}
	if (status >= 0) {
	    mb_rule.end = have - 1;
	    mb_rule.ch = decoded;
	    return mb_rule;
	}
	/* otherwise the sequence is incomplete: fetch another byte */
    }

    mb_rule.end = 0;
    mb_rule.ch = -1;
    return mb_rule;
}

/* Advance MB_RULE across the byte at BYTE_INDEX using the locale's multibyte
   decoder (mbrtowc), and return the updated rule.  Without HAVE_WCHAR_H the
   rule is returned unchanged.

   On return, `ch' is the decoded character when BYTE_INDEX starts a valid
   sequence (with `end' set to the number of trailing bytes still to come),
   -1 when BYTE_INDEX is a trailing byte of a sequence decoded earlier, or
   the raw first byte when the bytes here are not valid in the locale's
   encoding. */
static inline struct mb_rule apply_mb_rules_going_right (WEdit * edit, long byte_index, struct mb_rule mb_rule)
{
#ifdef HAVE_WCHAR_H
    wchar_t wc;
    unsigned char p[16];
    int n;
    if (mb_rule.end) {
	/* still inside a sequence that was decoded at its first byte */
	mb_rule.end--;
	mb_rule.ch = -1;
	return mb_rule;
    }
    /* never fetch more bytes than the scratch buffer can hold */
    for (n = 0; n < (int) sizeof (p) && n < (int) MB_CUR_MAX; n++) {
	/* Each attempt re-feeds the whole prefix p[0..n], so it must start
	   from the conversion state we were entered with: when mbrtowc
	   reports an incomplete sequence it has already absorbed those
	   bytes into the state it was given, and feeding them again on top
	   of that would corrupt the decoding. */
	struct mb_rule attempt = mb_rule;
	size_t r;
	p[n] = edit_get_byte (edit, byte_index + n);
	r = mbrtowc (&wc, (char *) p, n + 1, &attempt.shift_state);
	if (r == (size_t) -2)
	    continue;		/* incomplete - try again with one more byte */
	if (r == (size_t) -1) {
	    /* invalid sequence: report the raw byte, keep the entry state */
	    mb_rule.end = 0;
	    mb_rule.ch = *p;
	    return mb_rule;
	}
	attempt.end = n;
	attempt.ch = wc;
	return attempt;
    }
    mb_rule.end = 0;
    mb_rule.ch = -1;
#endif
    return mb_rule;
}

/* Return the multibyte decoding rule for the byte at BYTE_INDEX of EDIT.

   In a single-byte locale with UTF-8 interpretation off (or, without
   HAVE_WCHAR_H, whenever UTF-8 interpretation is off) the byte is returned
   as-is.  Otherwise the rule is obtained by stepping the decoder forward one
   byte at a time from the nearest known state:

     - edit->mb_rule / edit->last_get_mb_rule cache the result of the
       previous call;
     - edit->mb_marker is a list (newest first) of saved rules, recorded
       while scanning forward, used to restart when asked for an earlier
       position;
     - edit->mb_invalidate being set discards every marker at or beyond
       edit->last_get_mb_rule before anything else is done.

   Markers are only a cache: if one cannot be allocated it is skipped. */
struct mb_rule get_mb_rule (WEdit * edit, long byte_index)
{
    long i;
    if (
#ifndef HAVE_WCHAR_H
	   !option_utf_interpretation ||
#endif
	   (MB_CUR_MAX == 1 && !option_utf_interpretation)) {
	struct mb_rule r;
	/* don't hand back indeterminate fields (e.g. the shift state) */
	memset (&r, 0, sizeof (r));
	r.end = 0;
	r.ch = edit_get_byte (edit, byte_index);
	return r;
    }
    if (edit->mb_invalidate) {
	struct _mb_marker *s;
	/* drop every marker that is no longer trustworthy */
	while (edit->mb_marker && edit->mb_marker->offset >= edit->last_get_mb_rule) {
	    s = edit->mb_marker->next;
	    free (edit->mb_marker);
	    edit->mb_marker = s;
	}
	if (edit->mb_marker) {
	    /* resume from the newest surviving marker */
	    edit->last_get_mb_rule = edit->mb_marker->offset;
	    edit->mb_rule = edit->mb_marker->rule;
	} else {
	    /* nothing left: start over from a blank rule */
	    edit->last_get_mb_rule = -1;
	    memset (&edit->mb_rule, 0, sizeof (edit->mb_rule));
	}
	edit->mb_invalidate = 0;
    }
    if (byte_index > edit->last_get_mb_rule) {
	/* moving right: continue from the cached rule, leaving markers */
	for (i = edit->last_get_mb_rule + 1; i <= byte_index; i++) {
	    if (option_utf_interpretation)
		edit->mb_rule = apply_mb_rules_going_right_utf8_to_ucs4 (edit, i, edit->mb_rule);
	    else
		edit->mb_rule = apply_mb_rules_going_right (edit, i, edit->mb_rule);
	    if (i >
		(edit->mb_marker ? edit->mb_marker->offset +
		 MB_MARKER_DENSITY : MB_MARKER_DENSITY)) {
		struct _mb_marker *m;
		m = malloc (sizeof (struct _mb_marker));
		if (m) {
		    m->next = edit->mb_marker;
		    m->offset = i;
		    m->rule = edit->mb_rule;
		    edit->mb_marker = m;
		}
		/* on allocation failure simply carry on without a marker */
	    }
	}
    } else if (byte_index < edit->last_get_mb_rule) {
	/* moving left: discard markers that lie beyond the target ... */
	while (edit->mb_marker && byte_index < edit->mb_marker->offset) {
	    struct _mb_marker *s;
	    s = edit->mb_marker->next;
	    free (edit->mb_marker);
	    edit->mb_marker = s;
	}
	/* ... then replay from the nearest marker, or from scratch */
	if (edit->mb_marker) {
	    edit->mb_rule = edit->mb_marker->rule;
	    i = edit->mb_marker->offset + 1;
	} else {
	    memset (&edit->mb_rule, 0, sizeof (edit->mb_rule));
	    /* NOTE(review): the replay starts at index -1, unlike the
	       forward scan which starts at 0; kept as found - confirm what
	       edit_get_byte returns for -1 before changing it */
	    i = -1;
	}
	for (; i <= byte_index; i++) {
	    if (option_utf_interpretation)
		edit->mb_rule = apply_mb_rules_going_right_utf8_to_ucs4 (edit, i, edit->mb_rule);
	    else
		edit->mb_rule = apply_mb_rules_going_right (edit, i, edit->mb_rule);
	}
    }
    edit->last_get_mb_rule = byte_index;
    return edit->mb_rule;
}

/* Return the `ch' field of the multibyte rule at BYTE_INDEX, i.e. the value
   get_mb_rule() reports for that position of the edit buffer. */
long edit_get_wide_byte (WEdit * edit, long byte_index)
{
    return get_mb_rule (edit, byte_index).ch;
}

