1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
package Torello.HTML.parse;

import Torello.HTML.*;
import Torello.Java.FileRW;

import java.util.Vector;
import java.io.IOException;
import java.util.regex.*;

/**
 * Package-private tokenizer that splits a range of an HTML string into {@code TagNode} and
 * {@code TextNode} instances.
 *
 * <p>Candidate elements are located with a hand-written character scan (a {@code '<'},
 * optionally a {@code '/'}, then an alpha-numeric tag name).  Only candidates whose name is
 * accepted by {@code HTMLTags.hasTag} are handed to the (more expensive) regular expression
 * {@code HTMLRegEx.P1}, which then decides where the element ends.</p>
 */
final class ParserREInternal
{
    /**
     * Tells whether {@code c} may directly follow a tag name inside a candidate HTML element.
     *
     * <p>Accepted characters are {@code '>'}, {@code '/'} and anything for which
     * {@link Character#isWhitespace(char)} is true; this is the hand-coded counterpart of the
     * regular-expression class {@code [\s>\/]}.  The {@code '/'} is what allows compact
     * self-closing elements such as {@code <br/>} to be recognized.</p>
     *
     * @param c the character found immediately after the alpha-numeric tag name
     * @return {@code true} if the candidate should be tested further, {@code false} if it
     *         cannot be an HTML element
     */
    private static boolean isTokenTerminator(char c)
    { return (c == '>') || (c == '/') || Character.isWhitespace(c); }

    /**
     * Appends a stretch of non-element text to the result vector and to the optional logs.
     *
     * <p>Nothing happens for an empty string.  A non-empty string is always added to
     * {@code ret} as a {@code TextNode}, but it is only logged when it contains something
     * other than white-space.</p>
     *
     * @param ret      vector receiving the new {@code TextNode}
     * @param text     the (un-trimmed) text found between two element matches
     * @param justText buffer receiving the un-trimmed text, or {@code null} to skip
     * @param matches  buffer receiving a {@code TEXT:} line with the trimmed text, or
     *                 {@code null} to skip
     */
    private static void emitText
        (Vector<HTMLNode> ret, String text, StringBuffer justText, StringBuffer matches)
    {
        if (text.length() == 0) return;

        ret.addElement(new TextNode(text));

        // White-space-only text is kept in the vector, but never written to the logs.
        String trimmedStr = text.trim();
        if (trimmedStr.length() == 0) return;

        if (justText != null)   justText.append(text);
        if (matches != null)    matches.append("TEXT:\t\t[").append(trimmedStr).append("]\n");
    }

    /**
     * Scans {@code htmlStr} from {@code sPos} up to (but not including) {@code ePos} and
     * appends the nodes that are found to {@code ret}, in document order.
     *
     * @param ret               vector to which the {@code TextNode} and {@code TagNode}
     *                          instances are appended
     * @param htmlStr           the text to tokenize
     * @param sPos              string-index at which scanning starts
     * @param ePos              string-index treated as end-of-input; characters at or beyond
     *                          this index are never read by the scan nor matched by the regex
     * @param eliminateHTMLTags when {@code true} the matched {@code TagNode}s are dropped and
     *                          only {@code TextNode}s are appended to {@code ret}
     * @param matchesFile       when non-null, a log holding one {@code TEXT:} line per
     *                          non-blank text segment and one {@code GROUP():} line per matched
     *                          element is passed to {@code FileRW.appendToFile} with this name
     * @param justTextFile      when non-null, the un-trimmed, non-blank text segments are
     *                          passed to {@code FileRW.appendToFile} with this name
     * @throws IOException      declared for the two {@code FileRW.appendToFile} calls
     *                          (presumably raised when a log file cannot be written)
     */
    static void getTokens(
            Vector<HTMLNode> ret,
            String htmlStr, int sPos, int ePos,
            boolean eliminateHTMLTags,
            String matchesFile, String justTextFile
        )
        throws IOException
    {
        final boolean       logMatches  = matchesFile != null;
        final boolean       logJustText = justTextFile != null;

        final Matcher       m           = HTMLRegEx.P1.matcher(htmlStr);
        final StringBuffer  matches     = logMatches  ? new StringBuffer() : null;
        final StringBuffer  justText    = logJustText ? new StringBuffer() : null;
        int                 start       = sPos; // String-index of the current candidate's '<'
        int                 end         = sPos; // String-index just past the last accepted element
        int                 cursor      = sPos; // The "loop counter"
        final int           HTML_EOF    = ePos; // The "imagined EOF" (prevents sub-string)
        final byte          MAX_TOK_LEN = HTMLTags.maxTokenLength();
                                                // Longest (currently registered) HTML String-Token Length

        // Main loop breaks whenever one of the inner while-loops reaches EOF.  Every 'continue'
        // below abandons the current candidate; since 'end' is left untouched in that case, the
        // abandoned characters simply become part of the next text segment.
        while (true)
        {
            // All (100%) of HTML Elements begin with the less-than '<' symbol.  Advance the
            // cursor until one is found, or we fall off the end of the page.
            while ((cursor < HTML_EOF) && (htmlStr.charAt(cursor) != '<')) cursor++;

            // If we have reached EOF (or sub-page), before finding the '<' STOP IMMEDIATELY.
            if (cursor == HTML_EOF)                                 break;

            // Start now holds the position of the next less-than symbol on the page
            start = cursor;
            cursor++;
            if (cursor == HTML_EOF)                                 break;

            // This may be a "Closing Tag" - if so we have to advance the cursor one extra place
            TC openOrClosed = TC.OpeningTags;
            if (htmlStr.charAt(cursor) == '/')
            {
                openOrClosed = TC.ClosingTags;
                cursor++;
                if (cursor == HTML_EOF)                             break;
            }

            // Whether this HTML Element will go on to match as a "TC.OpeningTags" or
            // "TC.ClosingTags", the variable 'tokStartPos' now holds the starting string-index
            // of the HTML Element Tag/Token name
            final int tokStartPos = cursor;

            // All HTML Elements have Tag/Token Names that may only contain letters or numbers
            // (are 'Alpha-Numeric').  Keep advancing the cursor until EOF, Token-too-long, or a
            // non Alpha-Numeric char is found.
            while (     (cursor < HTML_EOF)
                    &&  ((cursor - tokStartPos) <= MAX_TOK_LEN)
                    &&  Character.isLetterOrDigit(htmlStr.charAt(cursor))    )
                cursor++;

            // If EOF was reached first, then exit main loop IMMEDIATELY.
            if (cursor == HTML_EOF)                                 break;

            // If the Token String would be too long to match a valid token, start over.
            if ((cursor - tokStartPos) > MAX_TOK_LEN)               continue;

            // Ensure that the first non-alpha-numeric char that was identified is a
            // greater-than symbol '>', a forward-slash '/' (as in "<br/>"), or white-space.
            // If none of these, then start over.
            if (! isTokenTerminator(htmlStr.charAt(cursor)))        continue;

            // A UNICODE Code-Point beyond the 8-bit range at this position once crashed the
            // parser (the offending page could not be found again).  Such characters can only
            // get here through Character.isWhitespace; reject them.
            if (htmlStr.codePointAt(cursor) > 255)                  continue;

            // This is the "potential" HTML Element tag/token name.  There is, obviously, a
            // possibility that it is not actually an HTML Element name.
            String token = htmlStr.substring(tokStartPos, cursor);

            // This will verify that the token that was found (was after a less-than '<' symbol)
            // is actually a valid HTML Element name.  A null result means: skip it, start over.
            TagNode tn = HTMLTags.hasTag(token, openOrClosed);
            if (tn == null)                                         continue;

            // Restrict the RegEx Matcher to the region that begins at the '<' saved in 'start'
            // and stops at the imagined EOF.
            m.region(start, HTML_EOF);

            // 'lookingAt' only succeeds if the pattern matches beginning exactly at the region
            // start (an implied '^', but no implied '$').  If it fails, this simply cannot be
            // an HTML Element.
            if (! m.lookingAt())                                    continue;

            // This holds the complete HTML Element (including any attributes) from the
            // opening '<' to the closing '>' symbols.
            String htmlTag = m.group();

            // Use the "pre-instantiated" TagNode - UNLESS the matched element is a longer string
            // (it carries "innerTag" information such as class="..." or HREF="...").
            if (tn.str.length() != htmlTag.length()) tn = new TagNode(htmlTag);

            // *ALL* the text between the PREVIOUS accepted element and the start of the CURRENT
            // one.  NOTE: 'text' must be extracted BEFORE 'end' is moved forward!
            String text = htmlStr.substring(end, start);
            end         = m.end();
            cursor      = end;

            // Text (if any) precedes the element in the output vector and in the match log.
            emitText(ret, text, justText, matches);

            // One (LEGACY) feature that is being maintained (since it was useful), is to
            // eliminate all the HTML TagNodes, and only return the TextNodes...
            if (! eliminateHTMLTags)    ret.addElement(tn);

            // EXTREMELY USEFUL (LEGACY / DEBUGGING) Feature, that will be kept / maintained.
            if (logMatches)             matches.append("GROUP():\t[").append(htmlTag).append("]\n");
        }
        // MAIN WHILE LOOP END

        // Text that follows the very last accepted element would otherwise be dropped.  This
        // matters for short blurbs of HTML that do not end with a closing tag.  'end' holds the
        // end-position of the last RegEx match (or 'sPos' if there was none).
        if (end != HTML_EOF) emitText(ret, htmlStr.substring(end, HTML_EOF), justText, matches);

        // Write these String Buffers to a file.
        if (logMatches)     FileRW.appendToFile(matches,  matchesFile);
        if (logJustText)    FileRW.appendToFile(justText, justTextFile);
    }
}