001package Torello.Java;
002
003import java.util.*;
004import java.io.*;
005import java.util.regex.*;
006import java.util.zip.*;
007
008/**
009 * <CODE>RegExFiles - Documentation.</CODE><BR /><BR />
010 * <EMBED CLASS="external-html" DATA-FILE-ID="REF">
011 */
012@Torello.HTML.Tools.JavaDoc.StaticFunctional
013public class RegExFiles
014{
015    private RegExFiles() { }
016
017    /**
018     * This loads a regular expression text file.  Each line is interpreted as a new Regular
019     * Expression {@code Pattern}.
020     *
021     * <BR /><BR /><B>NOTE:</B> This method expects the <B><I>entire regular expression to fit on
022     * a single line</I></B>, and therefore, each new line containing text-data (without a
023     * starting <B>{@code '#'}</B>) will be compile into a new regular expression.  Use the
024     * {@code '\n'} within the expression to generated newlines.
025     *
026     * <BR /><BR /><B>Notes about Syntax Rules:</B>
027     * <BR /><BR /><UL CLASS="JDUL">
028     * <LI><B>Comment lines</B> are lines beginning with the <I>POUND</I> <B>({@code '#'})</B>
029     *        sign.
030     * </LI>
031     * <LI><B>Blank lines</B> are ignored by the file-parse completely.</LI>
032     * <LI>Lines with <B>only white-space</B> are considered blank.</LI>
033     * <LI><B>Flag Lines</B> are lines that begin with two, successive, <I>POUND</I>
034     *     <B>({@code '##'})</B> signs.
035     * </LI>
036     * <LI>All non-comment, non-blank and non-flag lines are converted into Regular-Expression
037     *     {@code Pattern's}
038     * </LI>
039     * </UL>
040     *
041     * <BR /><BR /><B>IMPORTANT:</B> This method will <I>halt program execution</I> if any
042     * exceptions occur when loading a Regular-Expression text file!  This is the purpose of 
043     * {@code 'LFEC'} - Load File Exception Catch.
044     *
045     * @param f Filename for a Regular Expression
046     *
047     * @return A {@code Vector} containing one compiled regular expression per line.  Comment lines
048     * &amp; blank lines will all be ignored.
049     *
050     * @see java.util.regex.Pattern
051     * @see #generateFlags(String)
052     * @see LFEC#ERROR_EXIT(Throwable, String)
053     */
054    public static Vector<Pattern> LFEC(String f)
055    {
056        try
057            { return parse(FileRW.loadFileToVector(f, false), f); }
058        catch (Throwable t)
059            { LFEC.ERROR_EXIT(t, "Attempt to load Regular Expression file: [" + f + "], failed.\n"); }
060
061        return null; // Should NOT be possible to reach this statement...
062    }
063
064    /**
065     * This does the <B><I>exact same thing</I></B> as {@link LFEC}, but loads the file into a
066     * {@code Vector} using the "JAR File" information included here.  In this case, parameter
067     * {@code f} indicates a jar-file class-loader pointer.  It will not load from the standard
068     * file-system.
069     *
070     * <BR /><BR /><B>NOTE:</B> The JAR implies that the "load resource as stream" function is
071     * being used in place of standard file i/o routines.  Specifically, this loads from a Jar
072     * file!
073     *
074     * <BR /><BR /><B>LOADS:</B>
075     * <DIV CLASS=SNIP>{@code
076     * BufferedReader br =
077     *     new BufferedReader(new InputStreamReader(c.getResourceAsStream(f)));
078     * }</DIV>
079     *
080     * @param c This contains the class that is loading the file.  It is not too important to use
081     * the "exact class" - since the only reason the class doing the loading is because the
082     * "Class Loader" employs the exact "Package Name" of the class for figuring out the
083     * directory / sub-directory where the data-file is stored.  This variable may not be null.
084     *
085     * <BR /><BR /><B>EXAMPLE:</B> If you wanted to load a "Regular Expressions.txt" file that
086     * was in the same BASH/Debian/etc...  directory as the following class - the following call
087     * to {@code 'RegExFiles'} would load the text-file "Regular Expressions.txt" into memory
088     * quickly.  The primary purpose being that text files are <B><I>much easier to read than
089     * 'double-escaped' Java {@code String's}.</I></B>
090     *
091     * <BR /><BR /><B>NOTE:</B> It might be important to read the Java Doc's about the
092     * {@code 'getResourceAsStream(String)'} method for retrieving data that was stored to a JAR
093     * file instead of a UNIX/BASH/MS-DOS system file.  Oracle's Java 8 would help.
094     *
095     * <EMBED CLASS="external-html" DATA-FILE-ID="RAWTYPES">
096     *
097     * @param f This is a file-pointer to a file stored inside a Java JAR file.
098     *
099     * @return A Vector containing one compiled regular expression per line.  Comment lines &amp;
100     * blank lines will all be ignored.
101     *
102     * @see #LFEC(String)
103     * @see #parse(Vector, String)
104     * @see LFEC#ERROR_EXIT(Throwable, String)
105     */
106    public static Vector<Pattern> LFEC_JAR(Class<?> c, String f)
107    {
108        try {
109            InputStream     is      = c.getResourceAsStream(f);
110            BufferedReader  br      = new BufferedReader(new InputStreamReader(is));
111            String          s       = "";
112            StringBuilder   sb      = new StringBuilder();
113            Vector<String>  file    = new Vector<String>();
114
115            while ((s = br.readLine()) != null) file.addElement(s);
116
117            is.close();
118
119            return parse(file, f);
120
121        }
122        catch (Throwable t)
123        { 
124            LFEC.ERROR_EXIT(t,
125                "Attempted to load Regular Expression file: [" + f + "]\n" +
126                "From jar-file using class: [" + c.getCanonicalName() + "]\n" +
127                "Did not load successfully."
128            );
129        }
130
131        return null;    // Should NOT be possible to reach this statement...
132                        // Compiler does not recognize LFEC.ERROR_EXIT
133    }
134
135    /**
136     * This is identical to {@code LFEC_JAR}, except that it presumes the file was compressed
137     * before saving.
138     *
139     * @param c This contains the class that is loading the file.  It is not too important to use
140     * the "exact class" - since the only reason the class doing the loading is because the "Class
141     * Loader" employs the exact "Package Name" of the class for figuring out the directory /
142     * sub-directory where the data-file is stored.  This variable may not be null.  Again, the
143     * class-loader looks in the directory of the package that contains this class!
144     *
145     * <BR /><BR /><B>NOTE:</B> The method {@code public static Vector<Pattern> 
146     * LFEC_JAR(Class, String;)} has a more detailed look at the particular use of this parameter.
147     * The easy way to understand is: just pass the class that is doing the actual loading of the
148     * regular-expression <B><I>(presuming the regex.dat file is in the same directory as the 
149     * {@code '.class'} file!)</I></B>
150     *
151     * <EMBED CLASS="external-html" DATA-FILE-ID="RAWTYPES">
152     *
153     * @param f This is a file-pointer to a file stored inside a Java JAR file.
154     *
155     * @return A {@code Vector} containing one compiled regular expression per line.  Comment
156     * lines &amp; blank lines will all be ignored.
157     *
158     * @see #LFEC_JAR(Class, String)
159     * @see #parse(Vector, String)
160     * @see LFEC#ERROR_EXIT(Throwable, String)
161     */
162    public static Vector<Pattern> LFEC_JAR_ZIP(Class<?> c, String f)
163    {
164        try {
165            InputStream         is          = c.getResourceAsStream(f);
166            GZIPInputStream     gzip        = new GZIPInputStream(is);
167            ObjectInputStream   ois         = new ObjectInputStream(gzip);
168            Object              ret         = ois.readObject();
169            String              fileStr     = (String) ret;
170            Vector<String>      file        = new Vector<>();
171            int                 newLinePos  = 0;
172
173            is.close();
174
175            while ((newLinePos = fileStr.indexOf('\n')) != -1)
176            {
177                file.addElement(fileStr.substring(0, newLinePos));
178                fileStr = fileStr.substring(newLinePos + 1);
179            }
180
181            return parse(file, f);
182
183        } catch (Throwable t)
184        {
185            LFEC.ERROR_EXIT(t,
186                "Attempted to load Regular Expression file: [" + f + "]\n" +
187                "From jar-file using class: [" + c.getCanonicalName() + "]\n" +
188                "Content was zipped, but failed to load."
189            );
190        }
191
192        return null; // Should NOT be possible to reach this statement...
193    }
194
195    /**
196     * This does the <B><I>exact same thing</I></B> as {@link LFEC}, but takes a "pre-loaded file"
197     * as a {@code Vector}.  This is an internal class - used to ensure that the methods:
198     * {@code LFEC_JAR} and {@code LFEC} do the exact same thing.
199     *
200     * @param file This presumes that the regular-expression text-file has been loaded into a
201     * {@code Vector<String>} (w/out the "include newlines" option!)
202     *
203     * @param name The name of the file loading is required so that error-printing-information is
204     * easier.
205     *
206     * @return A {@code Vector} containing one compiled regular expression per line.  Comment lines
207     * &amp; blank lines will all be ignored.
208     *
209     * @see #LFEC(String)
210     */
211    protected static Vector<Pattern> parse(Vector<String> file, String name)
212    {
213        try {
214            Vector<Pattern> ret     = new Vector<Pattern>();
215            int             flags   = 0;
216
217            for (String line : file)
218            {
219                if (line.trim().length() == 0) continue;
220
221                if (line.charAt(0) == '#')
222                {
223                    if (line.length() > 1) if (line.charAt(1) == '#') flags = generateFlags(line);
224                    continue;
225                }
226
227                if (flags != 0)                 ret.add(Pattern.compile(line, flags));
228                else                            ret.add(Pattern.compile(line));
229
230                flags = 0;
231            }
232
233            return ret;
234        }
235        catch (Throwable t)
236            { LFEC.ERROR_EXIT(t, "error parsing regular expression file: " + name); }
237
238        return null; // Should NOT be possible to reach this statement...
239    }
240
241    /**
242     * This information has been copied from Java's regular expression: {@code Pattern}. This is a
243     * Helper function as it converts the text-{@code String's} into their constants, so that a
244     * user may include these text {@code String's} in a regular expression file.
245     *
246     * <BR /><BR /><B>NOTE:</B> The regular expression loader will only load regular expressions
247     * that fit on a single line of text.  Other than lines that begin with a comment, each line
248     * is intended/interpreted as an independent Regular Expression.
249     *
250     * @see java.util.regex.Pattern
251     */
252    protected static int generateFlags(String line)
253    {
254        int mask = 0;
255
256        if (line.contains("CANON_EQ"))          mask |= Pattern.CANON_EQ;
257        if (line.contains("CASE_INSENSITIVE"))  mask |= Pattern.CASE_INSENSITIVE;
258        if (line.contains("DOTALL"))            mask |= Pattern.DOTALL;
259        if (line.contains("COMMENTS"))          mask |= Pattern.COMMENTS;
260        if (line.contains("LITERAL"))           mask |= Pattern.LITERAL;
261        if (line.contains("MULTILINE"))         mask |= Pattern.MULTILINE;
262        if (line.contains("UNICODE_CASE"))      mask |= Pattern.UNICODE_CASE;
263
264        return mask;
265    }
266}