001package Torello.Languages;
002
003import Torello.Java.*;
004import Torello.HTML.*;
005import Torello.Java.Additional.RemoveUnsupportedIterator;
006
007import java.io.*;
008import java.util.*;
009import java.util.regex.*;
010
011/**
012 * <CODE>Verbs (Spanish) - Documentation.</CODE><BR /><BR />
013 * <EMBED CLASS="external-html" DATA-FILE-ID="V">
014 */
015@SuppressWarnings("unchecked")
016public class Verbs
017{
018    private Verbs() { }
019
020    private static String                   conjugations            = null;
021    private static TreeSet<String>          irregularInfinitives    = new TreeSet<>();
022    private static TreeSet<String>          infinitives             = new TreeSet<>();
023    private static TreeMap<String, String>  definitions             = new TreeMap<>();
024
025    private static final String             R                       = "Spanish/";
026    private static final String             CONJUGATIONS            = R + "CONJUGATIONS.tmdat";  // ? '.tmdat' ? not String ?
027    private static final String             IRREG_INFINITIVES       = R + "IRREGULAR_INFINITIVES.tsdat";
028    private static final String             INFINITIVES             = R + "INFINITIVES.tsdat";
029    private static final String             DEFINITIONS             = R + "DEFINITIONS.tmdat";
030    private static final String             DEFINITIONS_JS          = R + "definitions.sdat";
031    private static final String             POPUP_WIN_JS            = R + "verbs.sdat";
032    private static final String             IRREG_CONJ_JS           = R + "ir.sdat";
033    private static final String             POPUP_WIN_CSS           = R + "css.sdat";
034
035    private static Runtime                  rt                      = null;
036
037    private static void GC()
038    { if (rt == null) rt = Runtime.getRuntime(); rt.gc(); }
039
040    /**
041     * Loads the Conjugations String into memory.  This must be in memory before working with
042     * Verb-Spans.
043     * 
044     * @see LFEC#loadFile_JAR(Class, String)
045     */
046    public static void loadConjugations()
047    { conjugations = LFEC.loadFile_JAR(Torello.Data.DataFileLoader.class, CONJUGATIONS); }
048
049    /**
050     * Releases the memory for the (rather large) Java-String containing the verb conjugations.
051     * Calls gc().
052     * 
053     * @see #GC()
054     */
055    public static void releaseConjugations()
056    { conjugations = null; GC(); }
057
058    /**
059     * Loads the list of Irregular Infinitives, from .JAR.  This must be in memory before working
060     * with Verb-Spans.
061     * 
062     * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
063     */
064    public static void loadIrregularInfinitives()
065    {
066        irregularInfinitives = (TreeSet<String>) LFEC.readObjectFromFile_JAR
067            (Torello.Data.DataFileLoader.class, IRREG_INFINITIVES, true, TreeSet.class);
068    }
069
070    /**
071     * Releases the memory for the (rather large) TreeSet of Irregular Infinitives.  Calls gc().
072     * 
073     * @see #GC()
074     */
075    public static void releaseIrregularInfinitives()
076    { irregularInfinitives.clear(); irregularInfinitives = null; GC(); }
077
078    /**
079     * Loads the complete list of known infinitives from JAR to the {@code TreeSet<String>}
080     * 
081     * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
082     */
083    public static void loadInfinitives()
084    { 
085        infinitives = (TreeSet<String>) LFEC.readObjectFromFile_JAR
086            (Torello.Data.DataFileLoader.class, INFINITIVES, true, TreeSet.class);
087    }
088
089    /**
090     * Releases the memory for the TreeSet of infinitives.  Calls gc().
091     * 
092     * @see #GC()
093     */
094    public static void releaseInfinitives()
095    { infinitives.clear(); infinitives = null; GC(); }
096
097    /**
098     * Loads the definitions file - a {@code TreeMap<String, String>}
099     * 
100     * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
101     */
102    public static void loadDefinitions()
103    { 
104        definitions = (TreeMap<String, String>) LFEC.readObjectFromFile_JAR
105            (Torello.Data.DataFileLoader.class, DEFINITIONS, true, TreeMap.class);
106    }
107
108    /**
109     * Releasees the memory for the TreeMap of definitions.  Calls gc().
110     * 
111     * @see #GC()
112     */
113    public static void releaseDefinitions()
114    { definitions.clear(); definitions = null; GC(); }
115
116    // ********************************************************************************************
    // ************* Web-Files: Java-Script & CSS resources stored in the data-files *************
118    // ********************************************************************************************
119
120    /**
121     * <CODE>Web-Files (Java-Script) Documentation.</CODE><BR /><BR />
122     * 
123     * <EMBED CLASS="external-html" DATA-FILE-ID="WEBFJS">
124     */
125     public static class WebFiles
126     {
127        private WebFiles() { }
128
129        /**
130         * Extracts Java-Script Functions from a Data-File and returns the functions as a String.
131         * 
132         * @return This simply returns the necessary Java-Script file as a Java String that
133         * contains all verb definitions.  Save this file, transmit it, convert it.  Generally it
134         * can be used to make definition pop-up windows in JS.
135         * 
136         * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
137         */
138        public static String JSgetDefinitionsCode()
139        { 
140            return LFEC.readObjectFromFile_JAR
141                (Torello.Data.DataFileLoader.class, DEFINITIONS_JS, true, String.class);
142        }
143
144        /**
145         * Extracts Java-Script Functions from a Data-File and returns the functions as a String.
146         * 
147         * @return This returns the Java-Script file (as a String) that contains the "Verb Popup
148         * Window" code.  Save this file, and store it in an accessible directory when you use the
149         * "addVerbSpans" method.
150         * 
151         * <BR /><BR />Save this file to disk, and put it in your HTML
152         * {@code <HEADER>...<SCRIPT TYPE="text/javascript">} section!
153         * 
154         * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
155         */
156        public static String JSgetPopupCode()
157        {
158            return LFEC.readObjectFromFile_JAR
159                (Torello.Data.DataFileLoader.class, POPUP_WIN_JS, true, String.class);
160        }
161
162        /**
163         * Extracts CSS Declaractions from a Data-File and returns the CSS as a String.
164         * 
165         * @return This returns the CSS file (as a String) for formatting the "Verb Popup Window."
166         * If a call to method addVerbSpans is used, and the three files listed (3 Java-Script, and
167         * 1 CSS) are included in the directory of the output page, then verb-conjugation pop-up
168         * windows will be functioning.
169         * 
170         * <BR /><BR />Save this file to disk, and put it in your HTML
171         * {@code <HEADER>...<SCRIPT TYPE="text/javascript">} section!
172         * 
173         * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
174         */
175        public static String CSSgetCode()
176        {
177            return LFEC.readObjectFromFile_JAR
178                (Torello.Data.DataFileLoader.class, POPUP_WIN_CSS, true, String.class);
179        }
180
181        /**
182         * Extracts Java-Script Functions from a Data-File and returns the functions as a String.
183         * 
184         * @return This returns the last Java-Script file you will need to put a "Verbs Pop-up
185         * Window" on your Spanish HTML documents.
186         * 
187         * <BR /><BR />Save this file to disk, and put it in your HTML
188         * {@code <HEADER>...<SCRIPT TYPE="text/javascript">} section!
189         * 
190         * @see LFEC#readObjectFromFile_JAR(Class, String, boolean, Class)
191         */
192        public static String JSgetIrregulars()
193        { 
194            return LFEC.readObjectFromFile_JAR
195                (Torello.Data.DataFileLoader.class, IRREG_CONJ_JS, true, String.class);
196        }
197     }
198
199    // ********************************************************************************************
200    // ********************** View and Inspect the data in the data-files *************************
201    // ********************************************************************************************
202
203    /**
204     * Generates an iterator of Spanish Verb Infinitives.  Items may not be removed via the
205     * iterator's {@code 'remove()'} method.
206     * 
207     * @return An iterator of all Spanish Verbs loaded into the infinitives TreeSet.
208     * 
209     * @see RemoveUnsupportedIterator
210     */
211    public static Iterator<String> infinitives()
212    { return new RemoveUnsupportedIterator<String>(infinitives.iterator()); }
213
214    /**
215     * Generates an iterator of Spanish Irregular-Verbs in Infinitive Form.  Items may not be
216     * removed via the iterator's {@code 'remove()'} method.
217     * 
218     * @return An iterator of all Irregular Spanish Verbs loaded into the irregular-infinitives
219     * TreeSet.
220     * 
221     * @see RemoveUnsupportedIterator
222     */
223    public static Iterator<String> irregularInfinitives()
224    { return new RemoveUnsupportedIterator<String>(irregularInfinitives.iterator()); }
225
226    /**
227     * Gets the quick-definition of a Spanish Verb.<BR />
228     * <B>EXPECTATIONS:</B>
229     * 
230     * <BR /><BR /><UL CLASS="JDUL">
231     * <LI>The "definitions" data file must already be loaded into memory</LI>
232     * <LI>To be precise, loadIDefinitions() needs to have been called!</LI>
233     * <LI>word <B>MUST</B> be in <I>lower-case Spanish</I> - otherwise results might be
234     * inaccurate!</LI>
235     * <LI><B>TRY:</B> ES.toLowerCaseSpanish(String) to make sure.</LI>
236     * </UL>
237     * 
238     * @param infinitiveInLowerCase This may be any Spanish Verb - <I><B>as long as it is in the
239     * infinitive form.</I></B>
240     * 
241     * @return Will return the string stored as the value in the
242     * {@code TreeMap<String, String>} <I>definitions</I>, and null if this infinitive is not
243     * found in the  dictionary.
244     * 
245     * @see ES#toLowerCaseSpanish(String)
246     */
247    public static String getDefinition(String infinitiveInLowerCase)
248    { return definitions.get(infinitiveInLowerCase); }
249
250    /**
251     * Get the infinitive form of a verb-string.<BR />
252     * <B>EXPECTATIONS:</B>
253     * 
254     * <BR /><BR /><UL CLASS="JDUL">
255     * <LI>The "conjugations" data file must already be loaded into memory</LI>
256     * <LI>To be precise, loadIConjugations() needs to have been called!</LI>
257     * <LI>word <B>MUST</B> be in <I>lower-case Spanish</I> - otherwise results might be
258     * inaccurate!</LI>
259     * <LI><B>TRY:</B> ES.toLowerCaseSpanish(String) to make sure.</LI>
260     * </UL>
261     * 
262     * @param wordInLowerCase This can be any word (in Spanish... or any language for that
263     * matter).
264     * <BR /><BR />It is expected to be a conjugated form of a Spanish verb.  <I>If it
265     * is...</I> The original infinitive form of that verb will be returned.
266     * 
267     * @return <UL CLASS="JDUL">
268     * <LI>Returns the Infinitive of a verb - if the word passed is a direct conjugation of that
269     * verb.</LI>
270     * <LI>Returns null if there are no matching verbs conjugations in {@code private static String
271     * conjugations}</LI>
272     * </UL>
273     */
274    public static String getInfinitive(String wordInLowerCase)
275    {
276        // Eliminates common words that aren't verbs - but conjugate .. "para" "como"
277        // for (int k=0; k < skip.length; k++) if (wtlc.equals(skip[k])) return null;
278
279        // GREP through the conjugations data file (stored in String: conjugations)
280        int pos = conjugations.indexOf(" " + wordInLowerCase + ",");
281        if (pos == -1)  if (wordInLowerCase.charAt(wordInLowerCase.length() - 1) == 'r')
282            pos = conjugations.indexOf("\n" + wordInLowerCase + ":");
283
284        // the post-increment (++) is for the infinitive case match.
285        // Specifically, the first character, in this (the infinitive) case, would be a 
286        // newline '\n'.. and a '\n' character is exactly what the loop which follows is
287        // grep'ing for...
288        if (pos == -1) return null; else pos++;
289        
290        // There *WAS* a match in the conjugations data file. - get infinitive and return
291        while ((conjugations.charAt(--pos) != '\n') && (pos > 0));
292        return conjugations.substring(pos + 1, conjugations.indexOf(':', pos + 1));
293    }
294
295    /**
296     * Checks if a word is an irregular verb.
297     *
298     * <BR /><BR /><B>EXPECTATIONS:</B>
299     * 
300     * <BR /><BR /><UL CLASS="JDUL">
301     * <LI>The "irregular infinitives" data file must already be loaded into memory</LI>
302     * <LI>To be precise, loadIrregularInfinitives() needs to have been called!</LI>
303     * <LI>word <B>MUST</B> be in <I>lower-case Spanish</I> - otherwise results will be 
304     * inaccurate!</LI>
305     * <LI><B>TRY:</B> ES.toLowerCaseSpanish(String) to make sure</LI>
306     * </UL>
307     * 
308     * @param infinitiveInLowerCase This may be any Spanish Verb - as long as it is in the
309     * infinitive form.  This word must have been converted <I>to lower case</I>, and if not, it
310     * will likely return null.
311     * 
312     * @return Will return <I>TRUE</I> if this verb is contained by the list of irregular-verbs
313     * Will return <I>FALSE</I> otherwise.
314     * 
315     * @see ES#toLowerCaseSpanish(String)
316     */
317    public static boolean isIrregular(String infinitiveInLowerCase)
318    { return irregularInfinitives.contains(infinitiveInLowerCase); }
319
320
321    // *************** HTML TOKEN AND PUNCTUATION REMOVAL from Vocabulary Strings *****************
322    // Here is the expression I used
323    //     (complete-expression, without marking it up for Java... (escaped-slashes))
324    // This is looking for punctuation: [,-:;'"¿?¡!“”&@\.\?\$\(\)]
325    // The main string can ONLY contain THESE CHARS: [a-z,A-Z,áéíóúüñýÁÉÍÓÚÜÑÝ]
326    // ********************************************************************************************
327    // Match Groups:
328    //          Group 1: ORIGINAL String
329    //          Group 2: The "Prefix" - a.k.a. the "stuff" before the Spanish-Text
330    //          Group 3: The Spanish-text part (expected to be a Spanish Word - but not guaranteed)
331    //          Group 4: The "Suffix" - a.k.a. the "stuff" after the Spanish-Word/Text-token
332    // ********************************************************************************************
333    private static final String     PUNCTUATION = "([,-:;\'\"¿?¡!“”&@\\.\\?\\$\\(\\)\\s]*)";
334    private static final String     LANGUAGE    = "([a-zA-ZáéíóúüñýÁÉÍÓÚÜÑÝ]*?)"; 
335    private static final String     RE1         = "(^" + PUNCTUATION + LANGUAGE + PUNCTUATION + "$)";
336    private static final Pattern    P1          = Pattern.compile(RE1, Pattern.CASE_INSENSITIVE);
337    // ********************************************************************************************
338
339    /**
340     * This software is not perfect.  Human language is a new order of issues.  There are many
341     * features that could be added to make a better translator, but I have been busy writing an
342     * HTML Scrape Package instead.  When you see this array, what it means is that these words are
343     * extremely common words in Spanish, but usually, in about 80% to 90% of cases, aren't verbs.
344     * A "Lexical Analysis" could probably figure out much better when a word is guaranteed to be
345     * verb, but for now, these words are "just skipped" and never identified as verbs at all.
346     * 
347     * <BR /><BR /><B>NOTE:</B> You may change this at your discretion, just re-assign the array.
348     */
349    // When these wordsa are found in the newspaper, don't include them at all!
350    public static String[] skip =
351    { "como", "casa", "para", "uno", "una", "cosa", "nada", "entre", "dallas" };
352
353    /**
354     * This will call the "addSpanishVerbSpans" on each {@code TextNode} found in the page 
355     * {@code Vector}.
356     * 
357     * @param regularVerbsFound If this parameter isn't null, than any and all regular verbs found
358     * within  the text will be added to this {@code TreeSet}.  If this parameter is null, it will
359     * be ignored.
360     * 
361     * @param irregularVerbsFound If this parameter isn't null, than any irregular-verbs found in
362     * this text will be added to this {@code TreeSet}.  If this parameter is null, it will be
363     * ignored.
364     * 
365     * @param wordsNotFound All words that are found, and aren't verbs are entered into this
366     * {@code TreeSet}, if this  parameter is not null.  If this parameter is null, it will be
367     * ignored.
368     * 
369     * @see #addSpanishVerbSpans(String, TreeSet, TreeSet, TreeSet)
370     */
371    public static void addSpanishVerbSpans( 
372        Vector<HTMLNode> page,
373        TreeSet<String> regularVerbsFound,
374        TreeSet<String> irregularVerbsFound,
375        TreeSet<String> wordsNotFound
376    )
377    {
378        HTMLNode n;
379        for (int i=0; i < page.size(); i++)
380            if ((n = page.elementAt(i)) instanceof TextNode)
381            {
382               Vector<HTMLNode> withSpans = addSpanishVerbSpans
383                        (n.str, regularVerbsFound, irregularVerbsFound, wordsNotFound);
384
385               page.removeElementAt(i);
386               page.addAll(i, withSpans);
387               i += withSpans.size() - 1;   
388                    // Trust me, this is right!
389                    // If "withSpans.size() == 1" (a.k.a. "no-change"), then should do: i += 0;
390                    // If "withSpans.size() == 2" (increased by 1), then should do: i += 1;
391            }
392    }
393
394    /**
395     * The purpose of this class is to go through the Spanish Verbs in an HTML page, and replace
396     * 
397     * @param regularVerbsFound If this parameter isn't null, than any and all regular verbs found
398     * within  the text will be added to this {@code TreeSet}.  If this parameter is null, it will
399     * be ignored.
400     * 
401     * @param irregularVerbsFound If this parameter isn't null, than any irregular-verbs found in
402     * this text  will be added to this {@code TreeSet}.  If this parameter is null, it will be
403     * ignored.
404     * 
405     * @param wordsNotFound All words that are found, and aren't verbs are entered into this
406     * {@code TreeSet}, if this  parameter is not null.  If this parameter is null, it will be
407     * ignored.
408     * 
409     * @return An html sub-page (as a {@code Vector}) where each found Spanish-Verb has been
410     * surrounded by an HTML {@code <SPAN>} element  that indicates the regularity of the verb,
411     * and it's infinitive-form conjugation.
412     * 
413     * @see ES#onlyLanguageChars(String)
414     * @see ES#toLowerCaseSpanish(String)
415     * @see HTMLPage#getPageTokens(CharSequence, boolean)
416     */
417    public static Vector<HTMLNode> addSpanishVerbSpans(
418        String text,
419        TreeSet<String> regularVerbsFound,
420        TreeSet<String> irregularVerbsFound,
421        TreeSet<String> wordsNotFound
422    )
423    {
424        boolean keepRV  = regularVerbsFound     != null;    // Keep list of found regular-verbs in the tree-set
425        boolean keepIV  = irregularVerbsFound   != null;    // Keep list of found irregular-verbs in the tree set
426        boolean keepNV  = wordsNotFound         != null;    // Keep list of words that weren't verbs in the tree-set
427
428        StringBuilder outSB = new StringBuilder();
429
430        // Splits the string by spaces
431        String[] words = text.split(" ");
432        
433        for (int j=0; j < words.length; j++)
434        {
435            // Sometimes it is the empty string or just white-space
436            String trim = words[j].trim();
437            if (trim.length() == 0)
438                { outSB.append(" " + words[j]); continue; }
439            
440            // Eliminates leading and trailing punctuation & HTML tags
441            Matcher m = P1.matcher(trim);
442
443            if (! m.find())
444                { outSB.append(" " + words[j]); continue; }
445
446            String pre  = m.group(2);
447            String word = m.group(3);
448            String post = m.group(4);
449
450            if (! ES.onlyLanguageChars(word)) System.out.println
451                ("ORIG: [" + words[j] + "], " + pre + ", " + word + ", " + post);
452
453            if (word            == null)    { outSB.append(" " + words[j]); continue; }
454            if (pre             == null)    pre = "";
455            if (post            == null)    post = "";
456            if (word.length()   == 0)       { outSB.append(" " + words[j]); continue; }
457
458            String lc = ES.toLowerCaseSpanish(word);
459
460            // Skip the "ultra-common" non-verbs that look just like verbs.
461            for (String w : skip) if (lc.equals(w)) continue;
462
463            String infinitive=  getInfinitive(lc);
464
465            if (infinitive == null)
466                { if (keepNV) wordsNotFound.add(lc); continue;  }
467            else
468                { if (keepRV) regularVerbsFound.add(infinitive); }
469
470            outSB.append(" " + pre + "<SPAN CLASS=\"");
471
472            if (isIrregular(infinitive))
473                { outSB.append('I'); if (keepIV) irregularVerbsFound.add(infinitive); }
474            else
475                { outSB.append('R'); }
476
477            outSB.append("V\" DATA-V=\"" + infinitive + "\">" + word + "</SPAN>" + post);
478        }
479
480        outSB.append('\n');
481
482        return HTMLPage.getPageTokens(outSB, false);
483    }
484}