package squidpony;

import regexodus.Category;
import regexodus.MatchResult;
import regexodus.Pattern;
import regexodus.Replacer;
import regexodus.Substitution;
import regexodus.TextBuffer;
import squidpony.squidmath.CrossHash;
import squidpony.squidmath.StatefulRNG;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

/**
 * Class that builds up a dictionary of words in a source text to words generated by a FakeLanguageGen, and can
 * "translate" a source text to a similarly-punctuated, similarly-capitalized fake text. Uses a hash of each word in the
 * source text to determine the RNG seed that FakeLanguageGen will use, so the translation is not random. Can cipher a
 * typically English text and generate a text with FakeLanguageGen, but also decipher such a generated text with a
 * fully-complete, partially-complete, or partially-incorrect vocabulary.
 * <br>
 * This defaults to caching source-language words to their generated-language word translations in the field
 * {@link #table}, as well as the reverse translation in {@link #reverse}. This can be changed to reduce memory usage
 * for large vocabularies with {@code setCacheLevel()}, where it starts at 2 (writing to table and reverse), and can be
 * lowered to 1 (writing to table only) if you don't need reverse to decipher a language easily, or to 0 (writing to
 * neither) if you expect that memory will be at a premium and don't mind re-generating the same word each time it
 * occurs in a source text. If cacheLevel is 1 or less, then this will not check for overlap between
 * previously-generated words (it won't have an easy way to look up previously-generated ones) and so may be impossible
 * to accurately decipher. As an example, one test of level 1 generated "he" as the translation for both "a" and "at",
 * so every time "a" had been ciphered and then deciphered, the reproduced version said "at" instead. This won't happen
 * by default, but the default instead relies on words being entered as inputs to cipher() or lookup() in the same
 * order. If words are entered in two different orders to different runs of the program, they may have different
 * generated results if cacheLevel is 2. One way to handle this is to use cacheLevel 2 and cipher the whole game script,
 * or just the unique words in it (maybe just a large word list, such as http://wordlist.aspell.net/12dicts/ ), then
 * serialize the LanguageCipher for later usage.
 * @author Tommy Ettinger
 * Created by Tommy Ettinger on 5/1/2016.
 */
public class LanguageCipher implements Serializable {
    private static final long serialVersionUID = 1287835632461186341L;

    /**
     * The FakeLanguageGen this will use to construct words; normally one of the static fields in FakeLanguageGen or a
     * FakeLanguageGen produced by using the mix() method of one of them. Manually constructing FakeLanguageGen objects
     * isn't especially easy, and if you decide to do that it's recommended you look at SquidLib's source to see how the
     * existing calls to constructors work.
     */
    public FakeLanguageGen language;

    /** Re-seeded from the hash of each source word before generating, so its prior state never matters. */
    private StatefulRNG rng;

    // The maps below are plain HashMaps, not LinkedHashMaps, because this should never need a random element
    // to be requested from them.
    /**
     * The mapping of lower-case word keys to lower-case word values, where keys are in the source language and values
     * are generated by language.
     */
    public HashMap<String, String> table;

    /**
     * The mapping of lower-case word keys to lower-case word values, where keys are generated by language and values
     * are in the source language. Can be used as a complete vocabulary when passed to decipher.
     */
    public HashMap<String, String> reverse;

    // NOTE(review): alternation is tried left to right, so the first alternative (a plain run of letters) appears to
    // always win and the hyphenated-word alternative looks unreachable. Left as-is because changing which spans count
    // as a "word" would change every previously generated translation -- confirm intent before touching.
    private static final Pattern wordMatch = Pattern.compile("(\\pL+)|(\\pL[\\pL-]*\\pL)");

    /**
     * The degree of vocabulary to cache to speed up future searches at the expense of memory usage.
     * <ul>
     * <li>2 will cache source words to generated words in table, and generated to source in reverse.</li>
     * <li>1 will cache source words to generated words in table, and won't write to reverse.</li>
     * <li>0 won't write to table or reverse.</li>
     * </ul>
     * Defaults to 2, writing to both table and reverse.
     */
    public int cacheLevel = 2;

    /**
     * Constructs a LanguageCipher that uses a copy of {@code FakeLanguageGen.ENGLISH} to generate its text.
     */
    public LanguageCipher()
    {
        this(FakeLanguageGen.ENGLISH);
    }

    /**
     * Constructs a LanguageCipher that will use the given style of language generator to produce its text.
     * The generator is copied, so later changes to the argument do not affect this object.
     * @param language a FakeLanguageGen, typically one of the static constants in that class or a mix of them.
     */
    public LanguageCipher(FakeLanguageGen language)
    {
        this.language = language.copy();
        rng = new StatefulRNG();
        table = new HashMap<>(512);
        reverse = new HashMap<>(512);
    }

    /**
     * Copies another LanguageCipher and constructs this one with the information in the other. Copies the dictionary
     * of known words (both table and reverse), the FakeLanguageGen style, and the cacheLevel. The maps and the
     * language are copied, not shared, so the two ciphers can diverge afterwards.
     * @param other a previously-constructed LanguageCipher.
     */
    public LanguageCipher(LanguageCipher other)
    {
        this.language = other.language.copy();
        // A fresh RNG is enough: its state is overwritten from the word hash on every generation.
        this.rng = new StatefulRNG();
        this.table = new HashMap<>(other.table);
        this.reverse = new HashMap<>(other.reverse);
        // Previously omitted, which silently reset copies to the default level of 2.
        this.cacheLevel = other.cacheLevel;
    }

    /**
     * Given a word in the source language (usually English), looks up an existing translation for that word, or if none
     * exists, generates a new word based on the hash of the source word and this LanguageCipher's FakeLanguageGen.
     * The result is capitalized like the source: an upper-case first letter in source gives an upper-case first
     * letter, and an upper-case second letter in source makes the rest of the result upper-case.
     * @param source a word in the source language
     * @return a word in the fake language, or the empty String if source is null or empty
     */
    public String lookup(String source)
    {
        if (source == null || source.isEmpty())
            return "";
        final String lower = source.toLowerCase();
        // Single map access; a missing key and a null value are both treated as "not translated yet".
        String ciphered = table.get(lower);
        if (ciphered == null)
            ciphered = generate(lower);

        final char[] chars = ciphered.toCharArray();
        // Lu is the upper case letter category in Unicode; we're using regexodus for this because GWT probably
        // won't respect unicode case data on its own. We are using GWT to capitalize, though. Hope it works...
        applyCase(chars,
                Category.Lu.contains(source.charAt(0)),
                source.length() > 1 && Category.Lu.contains(source.charAt(1)));
        return new String(chars);
    }

    /**
     * Generates the fake-language word for an already lower-cased source word and records it in the caches that
     * the current cacheLevel allows.
     * @param lower the lower-case source word; must be non-empty
     * @return the generated lower-case word
     */
    private String generate(String lower)
    {
        // Seeding from the word's hash is what makes the translation repeatable.
        rng.setState(CrossHash.hash64(lower));
        String ciphered;
        int frustration = 0;
        do {
            ciphered = language.word(rng, false, (int) Math.ceil(lower.length() / (2.2 + rng.nextDouble())));
            // Without reverse being maintained there is nothing to check collisions against; and give up after
            // repeated collisions rather than looping indefinitely.
            if (cacheLevel < 2 || frustration++ > 9)
                break;
        } while (reverse.containsKey(ciphered));

        switch (cacheLevel) {
            case 2:
                reverse.put(ciphered, lower);
                // deliberate fall-through: level 2 also writes to table
            case 1:
                table.put(lower, ciphered);
                break;
            default:
                break;
        }
        return ciphered;
    }

    /**
     * Upper-cases parts of chars in place to imitate the capitalization of another word.
     * @param chars the characters to modify; may be empty, in which case nothing happens
     * @param firstUpper if true, the first char is upper-cased
     * @param restUpper if true, every char after the first is upper-cased
     */
    private static void applyCase(char[] chars, boolean firstUpper, boolean restUpper)
    {
        if (chars.length == 0)
            return;
        if (firstUpper)
            chars[0] = Character.toUpperCase(chars[0]);
        if (restUpper) {
            for (int i = 1; i < chars.length; i++) {
                chars[i] = Character.toUpperCase(chars[i]);
            }
        }
    }

    /**
     * Given a String, StringBuilder, or other CharSequence that should contain words in the source language, this
     * translates each word to the fake language, using existing translations if previous calls to cipher() or lookup()
     * had translated that word. Text that is not matched as a word (such as punctuation and spaces) is kept as-is.
     * @param text a CharSequence, such as a String, that contains words in the source language
     * @return a String of the translated text.
     */
    public String cipher(CharSequence text)
    {
        Replacer rep = wordMatch.replacer(new CipherSubstitution());
        return rep.replace(text);
    }

    /** Replaces each matched source word with its lookup() result; needs the enclosing cipher, so it is not static. */
    private class CipherSubstitution implements Substitution
    {
        @Override
        public void appendSubstitution(MatchResult match, TextBuffer dest) {
            dest.append(lookup(match.group(0)));
        }
    }

    /**
     * Replaces each matched fake-language word with its entry in a vocabulary, imitating the capitalization of the
     * matched text; words without an entry are copied through unchanged.
     */
    private static class DecipherSubstitution implements Substitution
    {
        private final Map<String, String> vocabulary;

        DecipherSubstitution(final Map<String, String> vocabulary)
        {
            this.vocabulary = vocabulary;
        }

        @Override
        public void appendSubstitution(MatchResult match, TextBuffer dest) {
            final String matched = match.group(0);
            if (matched == null) {
                return;
            }
            // Vocabulary keys are expected to be lower-case, while the pattern matches case-insensitively.
            final String translated = vocabulary.get(matched.toLowerCase());
            if (translated == null) {
                dest.append(matched);
                return;
            }
            final char[] chars = translated.toCharArray();
            applyCase(chars,
                    match.length() > 0 && Category.Lu.contains(match.charAt(0)),
                    match.length() > 1 && Category.Lu.contains(match.charAt(1)));
            dest.append(chars, 0, chars.length);
        }
    }

    /**
     * Deciphers words in an already-ciphered text with a given String-to-String Map for a vocabulary. This Map could be
     * the reverse field of this LanguageCipher, which would give a complete translation, or it could be a
     * partially-complete or partially-correct vocabulary of words the player has learned. The vocabulary should
     * typically have entries added using the quick and accurate learnTranslations() method, unless you want to add
     * translations one word at a time (then use learnTranslation() ) or you want incorrect or biased translations added
     * (then use mismatchTranslation() ). You don't need to use one of these methods if you just pass the whole of the
     * reverse field as a vocabulary, which will translate every word. If making your own vocabulary without the learn
     * methods, the keys need to be lower-case because while regex Patterns can be case-insensitive, Map lookups cannot.
     * <br>
     * If vocabulary is null, is empty, or has no usable (non-null, non-empty) keys, text is returned unchanged.
     * @param text a text in the fake language
     * @param vocabulary a Map of Strings in the fake language to Strings in the source language
     * @return a deciphered version of text that has any words as keys in vocabulary translated to the source language
     */
    public String decipher(String text, final Map<String, String> vocabulary)
    {
        if (vocabulary == null || vocabulary.isEmpty())
            return text;

        // Build one alternation of literally-quoted keys: (?:(?:\Qkey1\E)|(?:\Qkey2\E)|...)
        final StringBuilder sb = new StringBuilder(128);
        sb.append("(?:");
        boolean any = false;
        for (String k : vocabulary.keySet())
        {
            // An empty key would produce zero-length matches, and a null key is not a word at all.
            if (k == null || k.isEmpty())
                continue;
            if (any)
                sb.append('|');
            sb.append("(?:\\Q");
            sb.append(k);
            sb.append("\\E)");
            any = true;
        }
        // With no alternatives the group would be "(?:)" at best and an invalid pattern at worst.
        if (!any)
            return text;
        sb.append(')');

        final Pattern pat = Pattern.compile("\\b" + sb + "\\b", "ui");
        final Replacer rep = pat.replacer(new DecipherSubstitution(vocabulary));
        return rep.replace(text);
    }

    /**
     * Adds a translation pair to vocabulary so it can be used in decipher, giving a correct translation for sourceWord.
     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct a mismatched
     * translation added to vocabulary with mismatchTranslation. A null or empty sourceWord is ignored, since it has
     * no ciphered form that could act as a key.
     * @param vocabulary a Map of String keys to String values that will be modified in-place
     * @param sourceWord a word in the source language, typically English; the meaning will be "learned" for decipher
     * @return this, for chaining
     */
    public LanguageCipher learnTranslation(Map<String, String> vocabulary, String sourceWord)
    {
        if (sourceWord != null && !sourceWord.isEmpty())
            vocabulary.put(lookup(sourceWord.toLowerCase()), sourceWord);
        return this;
    }

    /**
     * Adds translation pairs to vocabulary so it can be used in decipher, giving a correct translation for sourceWords.
     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct mismatched
     * translations added to vocabulary with mismatchTranslation.
     * @param vocabulary a Map of String keys to String values that will be modified in-place
     * @param sourceWords an array or vararg of words in the source language, typically English; their meanings will
     *                    be "learned" for decipher
     * @return this, for chaining
     */
    public LanguageCipher learnTranslations(Map<String, String> vocabulary, String... sourceWords)
    {
        for (String word : sourceWords) {
            learnTranslation(vocabulary, word);
        }
        return this;
    }

    /**
     * Adds translation pairs to vocabulary so it can be used in decipher, giving a correct translation for sourceWords.
     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct mismatched
     * translations added to vocabulary with mismatchTranslation.
     * @param vocabulary a Map of String keys to String values that will be modified in-place
     * @param sourceWords an Iterable of words in the source language, typically English; their meanings will be
     *                    "learned" for decipher
     * @return this, for chaining
     */
    public LanguageCipher learnTranslations(Map<String, String> vocabulary, Iterable<String> sourceWords)
    {
        for (String word : sourceWords) {
            learnTranslation(vocabulary, word);
        }
        return this;
    }

    /**
     * Adds a translation pair to vocabulary so it can be used in decipher, giving a typically-incorrect translation for
     * correctWord where it provides mismatchWord instead when the ciphered version of correctWord appears.
     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. You can use learnTranslation() to
     * correct a mismatched vocabulary word, or mismatchTranslation() again to change the mismatched word. A null or
     * empty correctWord is ignored, since it has no ciphered form that could act as a key.
     * @param vocabulary a Map of String keys to String values that will be modified in-place
     * @param correctWord a word in the source language, typically English; where the ciphered version of this
     *                    appears and the text is deciphered, mismatchWord will be used instead
     * @param mismatchWord a String that will be used for deciphering in place of the translation of correctWord.
     * @return this, for chaining
     */
    public LanguageCipher mismatchTranslation(Map<String, String> vocabulary, String correctWord, String mismatchWord)
    {
        if (correctWord != null && !correctWord.isEmpty())
            vocabulary.put(lookup(correctWord.toLowerCase()), mismatchWord);
        return this;
    }

    /**
     * Gets the current cache level; see {@link #cacheLevel} for what each value means.
     * @return the value of the cacheLevel field
     */
    public int getCacheLevel() {
        return cacheLevel;
    }

    /**
     * Sets the cache level, clamping the given value into the range 0 to 2 inclusive.
     * @param cacheLevel the desired level; values below 0 become 0 and values above 2 become 2
     */
    public void setCacheLevel(int cacheLevel) {
        this.cacheLevel = Math.max(0, Math.min(2, cacheLevel));
    }
}