/**
 * Created by tomnotcat on 2024/2/28.
 */
import WinkTokenizer from 'wink-tokenizer';

/**
 * Reversible codec that replaces every English word token with a single
 * placeholder character and can later expand those characters back into the
 * original words.
 *
 * Placeholders are allocated sequentially from code points 19969 (U+4E01) up
 * to 40869 (U+9FA5), i.e. inside the CJK Unified Ideographs block. The mapping
 * is kept per instance, so text must be decoded with the same instance that
 * encoded it.
 *
 * NOTE(review): input that already contains a character which has been handed
 * out as a placeholder will be expanded by `decode` as well — confirm callers
 * never feed such text.
 */
class EnglishTokenCodec {
    constructor() {
        // word -> placeholder character
        this._tokenChar = new Map();
        // placeholder character -> word
        this._charToken = new Map();
        // Last code point handed out; 19968 is U+4E00, so the first
        // placeholder actually assigned is U+4E01.
        this._codePoint = 19968;
    }

    /**
     * Tokenizes `text` and returns a string in which each token tagged
     * 'word' is replaced by its placeholder character; all other tokens are
     * copied verbatim.
     *
     * A single ' ' is emitted before a token when the character immediately
     * preceding it in `text` is a space, newline, tab or no-break space.
     * NOTE(review): this presumes the tokenizer does not emit whitespace
     * tokens itself, so runs of whitespace collapse to one space and
     * trailing whitespace is dropped — confirm against wink-tokenizer.
     *
     * @param {string} text - Source text.
     * @returns {string} Encoded text.
     * @throws {Error} When the placeholder range is exhausted.
     */
    encode(text) {
        const tokenizer = WinkTokenizer();
        const tokens = tokenizer.tokenize(text);
        // NOTE: code point 160 is the no-break space (&nbsp;).
        const whitespace = new Set([" ", "\n", "\t", String.fromCodePoint(160)]);
        let result = "";
        // Search position in `text`; advances past each located token so
        // repeated tokens are matched in order.
        let cursor = 0;
        for (const token of tokens) {
            const pos = text.indexOf(token.value, cursor);
            if (pos >= 0) {
                cursor = pos + token.value.length;
                if (pos > 0 && whitespace.has(text[pos - 1])) {
                    result += ' ';
                }
            }

            result += token.tag === 'word'
                ? this.tokenToChar(token.value)
                : token.value;
        }
        return result;
    }

    /**
     * Expands `text` character by character.
     *
     * Iteration is by Unicode code point, so a character outside the BMP
     * (stored as a surrogate pair) yields one entry rather than two entries
     * holding lone surrogates.
     *
     * @param {string} text - Encoded text.
     * @returns {Array<{from: string, to: string}>} One entry per character:
     *   `from` is the character as found in `text`, `to` is the word it
     *   stands for, or the character itself when it is not a placeholder.
     */
    decodeToArray(text) {
        const result = [];
        for (const ch of text) {
            result.push({from: ch, to: this.charToToken(ch)});
        }
        return result;
    }

    /**
     * Expands every placeholder character in `text` back into its word and
     * leaves all other characters untouched.
     *
     * @param {string} text - Encoded text.
     * @returns {string} Decoded text.
     */
    decode(text) {
        let result = '';
        for (const piece of this.decodeToArray(text)) {
            result += piece.to;
        }
        return result;
    }

    /**
     * Returns the placeholder character for a word, allocating the next free
     * code point the first time the word is seen.
     *
     * @param {string} _t - Word token; a falsy value is returned unchanged.
     * @returns {string} The placeholder character (or `_t` when falsy).
     * @throws {Error} When more words are requested than code points are
     *   available (upper bound 40869, U+9FA5).
     */
    tokenToChar(_t) {
        if (!_t) {
            return _t;
        }

        const known = this._tokenChar.get(_t);
        if (known) {
            return known;
        }

        this._codePoint += 1;
        if (this._codePoint > 40869) {
            throw new Error("codePointt too big");
        }
        const placeholder = String.fromCodePoint(this._codePoint);
        // Register both directions so decoding is a single lookup.
        this._tokenChar.set(_t, placeholder);
        this._charToken.set(placeholder, _t);
        return placeholder;
    }

    /**
     * Returns the word a placeholder character stands for.
     *
     * @param {string} c - A single character.
     * @returns {string} The mapped word, or `c` itself when `c` is falsy or
     *   is not a known placeholder.
     */
    charToToken(c) {
        const token = c ? this._charToken.get(c) : undefined;
        return token ? token : c;
    }
}

export default EnglishTokenCodec;
