1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
package Torello.HTML.parse;

import java.util.regex.*;

public final class HTMLRegEx
{
    // ------------------------------------------------------------------------------------------
    // Fragments of the tag expression.  They are concatenated, in the order listed here, to
    // build P1.  All of them are compile-time constants.
    // ------------------------------------------------------------------------------------------

    // Start of a tag: '<', an optional '/', then the element name.  The name is 1 to 127 word
    // characters and is the only capturing group (group #1) of the finished expression.
    private static final String TAG_START = "<\\/?(\\w{1,127})";

    // attribute="value" - the value may hold any character except a double-quote.  A '>' IS
    // allowed inside the quotes.
    private static final String ATTR_DOUBLE_QUOTED = "[\\w-]+=\"[^\"]*?\"";

    // attribute='value' - the value may hold any character except a single-quote.  A '>' IS
    // allowed inside the quotes.
    private static final String ATTR_SINGLE_QUOTED = "[\\w-]+='[^']*?'";

    // attribute=value - un-quoted value made only of word characters and '-' (may be empty).
    private static final String ATTR_UNQUOTED = "[\\w-]+=[\\w-]*";

    // attribute - a name with no value at all.
    private static final String ATTR_NAME_ONLY = "[\\w-]+";

    // A run of white-space between the pieces of a tag.
    private static final String WHITE_SPACE = "\\s+";

    // Catch-all for anything else ("junk") that is not a '>'.
    private static final String JUNK = "[^>]+";

    // End of a tag: either ">" or "/>".
    private static final String TAG_END = "\\/?>";

    // The repeated, non-capturing group that consumes everything between the element name and
    // the end of the tag.
    //
    // ORDER MATTERS: alternatives are tried left to right, and JUNK has to stay LAST.  If it
    // came first it would swallow everything up to the first '>', and the two quoted-attribute
    // alternatives - which are what allow a '>' inside a quoted value to remain part of the
    // tag - would never get a chance to match.
    //
    // NOTE(review): these alternatives overlap (an attribute name or white-space is also
    //               matched by JUNK) and they sit inside a '*' loop.  This looks like it could
    //               backtrack heavily on input where "<name" is never followed by a '>'.
    //               Worth measuring before feeding this un-trusted or malformed HTML.
    private static final String TAG_INNER =
        "(?:"
            + ATTR_DOUBLE_QUOTED + "|"
            + ATTR_SINGLE_QUOTED + "|"
            + ATTR_UNQUOTED      + "|"
            + ATTR_NAME_ONLY     + "|"
            + WHITE_SPACE        + "|"
            + JUNK
        + ")*";

    // Matches one HTML tag, opening or closing, e.g. <a href="x">, </div> or <br/>.
    // Group #1 is the element name.  The earlier comment in this file names class ParserRE and
    // Torello.HTML.TagNode as the users of this expression.
    //
    // Pattern.DOTALL only changes the meaning of '.', which this expression never uses.  The
    // flag is retained so that the compiled Pattern is exactly the same as before.
    public static final Pattern P1 =
        Pattern.compile(TAG_START + TAG_INNER + TAG_END, Pattern.DOTALL);

    // Matches one HTML comment: "<!--", then the shortest possible run of characters, then
    // "-->".  Pattern.DOTALL lets '.' match line terminators, so a comment may span several
    // lines.  The earlier comment in this file names classes ParserRE and ParserHM as the users
    // of this expression.
    public static final Pattern P2 = Pattern.compile("<!--.*?-->", Pattern.DOTALL);

}