001package squidpony;
002
003import regexodus.Category;
004import regexodus.MatchResult;
005import regexodus.Pattern;
006import regexodus.Replacer;
007import regexodus.Substitution;
008import regexodus.TextBuffer;
009import squidpony.squidmath.CrossHash;
010import squidpony.squidmath.StatefulRNG;
011
012import java.io.Serializable;
013import java.util.HashMap;
014import java.util.Map;
015
016/**
017 * Class that builds up a dictionary of words in a source text to words generated by a FakeLanguageGen, and can
018 * "translate" a source text to a similarly-punctuated, similarly-capitalized fake text. Uses a hash of each word in the
019 * source text to determine the RNG seed that FakeLanguageGen will use, so the translation is not random. Can cipher a
020 * typically English text and generate a text with FakeLanguageGen, but also decipher such a generated text with a
021 * fully-complete, partially-complete, or partially-incorrect vocabulary.
022 * <br>
023 * This defaults to caching source-language words to their generated-language word translations in the field table, as
024 * well as the reverse translation in reverse. This can be changed to reduce memory usage for large vocabularies with
025 * {@code setCacheLevel()}, where it starts at 2 (writing to table and reverse), and can be lowered to 1 (writing to
026 * table only) if you don't need reverse to decipher a language easily, or to 0 (writing to neither) if you expect that
027 * memory will be at a premium and don't mind re-generating the same word each time it occurs in a source text. If
028 * cacheLevel is 1 or less, then this will not check for overlap between previously-generated words (it won't have an
029 * easy way to look up previously-generated ones) and so may be impossible to accurately decipher. As an example, one
030 * test of level 1 generated "he" as the translation for both "a" and "at", so every time "a" had been ciphered and then
031 * deciphered, the reproduced version said "at" instead. This won't happen by default, but the default instead relies on
032 * words being entered as inputs to cipher() or lookup() in the same order. If words are entered in two different orders
033 * to different runs of the program, they may have different generated results if cacheLevel is 2. One way to handle
034 * this is to use cacheLevel 2 and cipher the whole game script, or just the unique words in it (maybe just a large word
035 * list, such as http://wordlist.aspell.net/12dicts/ ), then serialize the LanguageCipher for later usage.
036 * @author Tommy Ettinger
037 * Created by Tommy Ettinger on 5/1/2016.
038 */
039public class LanguageCipher implements Serializable{
040    private static final long serialVersionUID = 1287835632461186341L;
041    /**
042     * The FakeLanguageGen this will use to construct words; normally one of the static fields in FakeLanguageGen or a
043     * FakeLanguageGen produced by using the mix() method of one of them. Manually constructing FakeLanguageGen objects
044     * isn't especially easy, and if you decide to do that it's recommended you look at SquidLib's source to see how the
045     * existing calls to constructors work.
046     */
047    public FakeLanguageGen language;
048    private StatefulRNG rng;
049    // not a LinkedHashMap because this should never be need a random element to be requested
050    /**
051     * The mapping of lower-case word keys to lower-case word values, where keys are in the source language and values
052     * are generated by language.
053     */
054    public HashMap<String, String> table,
055    /**
056     * The mapping of lower-case word keys to lower-case word values, where keys are generated by language and values
057     * are in the source language. Can be used as a complete vocabulary when passed to decipher.
058     */
059    reverse;
060    private static final Pattern wordMatch = Pattern.compile("(\\pL+)|(\\pL[\\pL-]*\\pL)");
061
062    /**
063     * The degree of vocabulary to cache to speed up future searches at the expense of memory usage.
064     * <ul>
065     * <li>2 will cache source words to generated words in table, and generated to source in reverse.</li>
066     * <li>1 will cache source words to generated words in table, and won't write to reverse.</li>
067     * <li>0 won't write to table or reverse.</li>
068     * </ul>
069     * Defaults to 2, writing to both table and reverse.
070     */
071    public int cacheLevel = 2;
072
073    /**
074     * Constructs a LanguageCipher that will generate English-like or Dutch-like text by default.
075     */
076    public LanguageCipher()
077    {
078        this(FakeLanguageGen.ENGLISH);
079    }
080
081    /**
082     * Constructs a LanguageCipher that will use the given style of language generator to produce its text.
083     * @param language a FakeLanguageGen, typically one of the static constants in that class or a mix of them.
084     */
085    public LanguageCipher(FakeLanguageGen language)
086    {
087        this.language = language.copy();
088        rng = new StatefulRNG();
089        table = new HashMap<>(512);
090        reverse = new HashMap<>(512);
091    }
092
093    /**
094     * Copies another LanguageCipher and constructs this one with the information in the other. Copies the dictionary
095     * of known words, as well as the FakeLanguageGen style and everything else.
096     * @param other a previously-constructed LanguageCipher.
097     */
098    public LanguageCipher(LanguageCipher other)
099    {
100        this.language = other.language.copy();
101        this.rng = new StatefulRNG();
102        this.table = new HashMap<>(other.table);
103        this.reverse = new HashMap<>(other.reverse);
104    }
105
106    /**
107     * Given a word in the source language (usually English), looks up an existing translation for that word, or if none
108     * exists, generates a new word based on the hash of the source word and this LanguageCipher's FakeLanguageGen.
109     * @param source a word in the source language
110     * @return a word in the fake language
111     */
112    public String lookup(String source)
113    {
114        if(source == null || source.isEmpty())
115            return "";
116        String s2 = source.toLowerCase(), ciphered;
117        if(table.containsKey(s2))
118            ciphered = table.get(s2);
119        else {
120            long h = CrossHash.hash64(s2), frustration = 0;
121            rng.setState(h);
122            do {
123                ciphered = language.word(rng, false, (int) Math.ceil(s2.length() / (2.2 + rng.nextDouble())));
124                if(cacheLevel < 2 || frustration++ > 9)
125                    break;
126            }while (reverse.containsKey(ciphered));
127            switch (cacheLevel) {
128                case 2: reverse.put(ciphered, s2);
129                case 1: table.put(s2, ciphered);
130            }
131        }
132        char[] chars = ciphered.toCharArray();
133        // Lu is the upper case letter category in Unicode; we're using regexodus for this because GWT probably
134        // won't respect unicode case data on its own. We are using GWT to capitalize, though. Hope it works...
135        if(Category.Lu.contains(source.charAt(0)))
136            chars[0] = Character.toUpperCase(chars[0]);
137        if(source.length() > 1 && Category.Lu.contains(source.charAt(1))) {
138            for (int i = 1; i < chars.length; i++) {
139                chars[i] = Character.toUpperCase(chars[i]);
140            }
141        }
142        return new String(chars);
143    }
144
145    /**
146     * Given a String, StringBuilder, or other CharSequence that should contain words in the source language, this
147     * translates each word to the fake language, using existing translations if previous calls to cipher() or lookup()
148     * had translated that word.
149     * @param text a CharSequence, such as a String, that contains words in the source language
150     * @return a String of the translated text.
151     */
152    public String cipher(CharSequence text)
153    {
154        Replacer rep = wordMatch.replacer(new CipherSubstitution());
155        return rep.replace(text);
156    }
157
158    private class CipherSubstitution implements Substitution
159    {
160        @Override
161        public void appendSubstitution(MatchResult match, TextBuffer dest) {
162            dest.append(lookup(match.group(0)));
163        }
164    }
165    private class DecipherSubstition implements Substitution
166    {
167        private final Map<String, String> vocabulary;
168        DecipherSubstition(final Map<String, String> vocabulary)
169        {
170            this.vocabulary = vocabulary;
171        }
172        public void appendSubstitution(MatchResult match, TextBuffer dest) {
173            String translated = match.group(0);
174            if(translated == null) {
175                return;
176            }
177            translated = translated.toLowerCase();
178            translated = vocabulary.get(translated);
179            if(translated == null) {
180                dest.append(match.group(0));
181                return;
182            }
183            char[] chars = translated.toCharArray();
184            if(Category.Lu.contains(match.charAt(0)))
185                chars[0] = Character.toUpperCase(chars[0]);
186            if(match.length() > 1 && Category.Lu.contains(match.charAt(1))) {
187                for (int i = 1; i < chars.length; i++) {
188                    chars[i] = Character.toUpperCase(chars[i]);
189                }
190            }
191            dest.append(chars, 0, chars.length);
192        }
193    }
194
195    /**
196     * Deciphers words in an already-ciphered text with a given String-to-String Map for a vocabulary. This Map could be
197     * the reverse field of this LanguageCipher, which would give a complete translation, or it could be a
198     * partially-complete or partially-correct vocabulary of words the player has learned. The vocabulary should
199     * typically have entries added using the quick and accurate learnTranslations() method, unless you want to add
200     * translations one word at a time (then use learnTranslation() ) or you want incorrect or biased translations added
201     * (then use mismatchTranslation() ). You don't need to use one of these methods if you just pass the whole of the
202     * reverse field as a vocabulary, which will translate every word. If making your own vocabulary without the learn
203     * methods, the keys need to be lower-case because while regex Patterns can be case-insensitive, Map lookups cannot.
204     * @param text a text in the fake language
205     * @param vocabulary a Map of Strings in the fake language to Strings in the source language
206     * @return a deciphered version of text that has any words as keys in vocabulary translated to the source language
207     */
208    public String decipher(String text, final Map<String, String> vocabulary)
209    {
210        Pattern pat;
211        Replacer rep;
212        StringBuilder sb = new StringBuilder(128);
213        sb.append("(?:");
214        for(String k : vocabulary.keySet())
215        {
216            sb.append("(?:\\Q");
217            sb.append(k);
218            sb.append("\\E)|");
219        }
220        sb.deleteCharAt(sb.length() - 1);
221        sb.append(')');
222
223        pat = Pattern.compile("\\b" + sb + "\\b", "ui");
224
225        rep = pat.replacer(new DecipherSubstition(vocabulary));
226        return rep.replace(text);
227    }
228
229    /**
230     * Adds a translation pair to vocabulary so it can be used in decipher, giving a correct translation for sourceWord.
231     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct a mismatched
232     * translation added to vocabulary with mismatchTranslation.
233     * @param vocabulary a Map of String keys to String values that will be modified in-place
234     * @param sourceWord a word in the source language, typically English; the meaning will be "learned" for decipher
235     * @return this, for chaining
236     */
237    public LanguageCipher learnTranslation(Map<String, String> vocabulary, String sourceWord)
238    {
239        vocabulary.put(lookup(sourceWord.toLowerCase()), sourceWord);
240        return this;
241    }
242
243    /**
244     * Adds translation pairs to vocabulary so it can be used in decipher, giving a correct translation for sourceWords.
245     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct mismatched
246     * translations added to vocabulary with mismatchTranslation.
247     * @param vocabulary a Map of String keys to String values that will be modified in-place
248     * @param sourceWords an array or vararg of words in the source language, typically English; their meanings will
249     *                    be "learned" for decipher
250     * @return this, for chaining
251     */
252    public LanguageCipher learnTranslations(Map<String, String> vocabulary, String... sourceWords)
253    {
254        for (int i = 0; i < sourceWords.length; i++) {
255            learnTranslation(vocabulary, sourceWords[i]);
256        }
257        return this;
258    }
259
260    /**
261     * Adds translation pairs to vocabulary so it can be used in decipher, giving a correct translation for sourceWords.
262     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct mismatched
263     * translations added to vocabulary with mismatchTranslation.
264     * @param vocabulary a Map of String keys to String values that will be modified in-place
265     * @param sourceWords an Iterable of words in the source language, typically English; their meanings will be
266     *                   "learned" for decipher
267     * @return this, for chaining
268     */
269    public LanguageCipher learnTranslations(Map<String, String> vocabulary, Iterable<String> sourceWords)
270    {
271        for (String s : sourceWords) {
272            learnTranslation(vocabulary, s);
273        }
274        return this;
275    }
276
277    /**
278     * Adds a translation pair to vocabulary so it can be used in decipher, giving a typically-incorrect translation for
279     * correctWord where it provides mismatchWord instead when the ciphered version of correctWord appears.
280     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. You can use learnTranslation() to
281     * correct a mismatched vocabulary word, or mismatchTranslation() again to change the mismatched word.
282     * @param vocabulary a Map of String keys to String values that will be modified in-place
283     * @param correctWord a word in the source language, typically English; where the ciphered version of this
284     *                    appears and the text is deciphered, mismatchWord will be used instead
285     * @param mismatchWord a String that will be used for deciphering in place of the translation of correctWord.
286     * @return this, for chaining
287     */
288    public LanguageCipher mismatchTranslation(Map<String, String> vocabulary, String correctWord, String mismatchWord)
289    {
290        vocabulary.put(lookup(correctWord.toLowerCase()), mismatchWord);
291        return this;
292    }
293
294    public int getCacheLevel() {
295        return cacheLevel;
296    }
297
298    public void setCacheLevel(int cacheLevel) {
299        if(cacheLevel >= 2) this.cacheLevel = 2;
300        else if(cacheLevel <= 0) this.cacheLevel = 0;
301        else this.cacheLevel = cacheLevel;
302    }
303}