FILE: ScrapeBostonGlobe.java, OUTPUT FILE: eMailAddresses.txt


package Torello.OLD.Y2019.M04;  // My "temp directory" package - rename to anything you want.

import Torello.Java.*;
import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;

import java.io.IOException;
import java.net.URL;
import java.util.*;
import java.util.regex.*;

 * The purpose of this class is to perform a "web-crawl" of most of the sub-sections in the Boston Globe,
 * and identify/create a complete list of e-mail addresses who domain ends with "".  This will produce
 * an output flat-file that contains a complete list of journalists working for "The Globe" (so you may harrass them
 * and accuse them of mayhem - which is what I do.)
public class ScrapeBostonGlobe
    // A small sub-set of the Boston Globe Newspaper Sections to search
    static final String[]           sections        = { "", "metro", "business", "news/politics", "lifestyle", "arts" };

    // This is a Regular-Expression Pattern (see for more help)
    // that matches Strings.  It looks for Strings that "look like" e-mail addresses AND end with ""
    static final Pattern            P               = Pattern.compile("[\\w\\.]", Pattern.CASE_INSENSITIVE);

    // TreeSet<String> is just a simple Java Data-Structure that eliminates duplicates (and sorts the elements, too).
    static final TreeSet<String>    eMailAddresses  = new TreeSet<>();

    public static void main(String[] argv) throws IOException
        System.out.println("Scraping E-Mail Addresses from Boston Globe Website.");

        for (String section : sections)
            // Instantiate a object for this section.
            // NOTE: Most (but not all) websites expect an "https://"  ***NOT***  an "http://" connection.
            // If "http" is accidentally used anywhere with this newspaper, your Java-connection will be ignored!
            URL                     url             = new URL("" + section);

            // A Torello.HTML.HTMLNode "vector-ized" version of a primary-section of Boston's Globe Newspaper Website
            Vector<HTMLNode>        v               = HTMLPage.getPageTokens(url, false);

            // This Vector will contain all HTML-Anchor (<A ...>) elements found on this particular "Main Section" page.
            Vector<TagNode>         anchors         = InnerTagGet.all(v, "a", "href");

            // *****
            // A more well-thought-out version of this program/module would check for "duplicate URL's" as we scan through
            // these sections - and also check more sections!  For brevity and clarity of this example, though, I do not scan
            // or remove duplicate URL's (of which there are sometimes hundreds!) and only check 6 sections of the newspaper.
            // Showing off Java's TreeSet data-structure (though useful!) doesn't really help understanding these "scrape" tools.
            // *****

            // This will strip/copy each of the "HREF=..." attribute-parts of the HTML-Anchor (<A HREF=..>) element.
            // This will then convert each HREF-reference into total/complete URL - since, very often, the HTML
            // coders will use only partial-URL's rather than full-URL's.
            // If an element in this Vector is null, the HREF could not be resolved - make sure to check in the loop below!
            Vector<URL>             articleURLs     = Links.resolveHREFs(anchors, url);

            System.out.println("Visiting Section:" + section);
            System.out.println("\tPage has (" + v.size() + ") HTMLNodes, and (" + anchors.size() + ") \"Anchor\" <A> TagNodes.");

            // This for-loop will visit each "resolved URL".  Many of these URL's (but not all of them!) will be newspaper articles.
            // NOTE: the "Links.resolveHREFs" will occasionaly return null as a value, make sure to skip on null.
            for (URL articleURL : articleURLs)
                if (articleURL != null)
                    try {
                        // Iterate through all HTMLNode.str values that are found on each downloaded (articleURL) page!
                        for (HTMLNode n : HTMLPage.getPageTokens(articleURL, false))
                            // Scan every node and see there are any n.str that contain a "" e-mail address
                            Matcher m = P.matcher(n.str);
                            while (m.find())
                                String eMailAddr =;

                                // Save each match into the "TreeSet<String>" data-structure
                                // (duplicates will be automatically removed - make sure to convert all to lower-case!)
                    } catch (Exception e) { System.out.println( "\tEXCEPTION-ERROR: " + e.getMessage() + "\n" +
                                                                "\tSKIPPING URL: " + articleURL.toString());     }

        // Print the results to flat-file, and save.
        StringBuffer sb = new StringBuffer();
        for (String eMailAddress : eMailAddresses) sb.append(eMailAddress + "\n");
        FileRW.writeFile(sb, "eMailAddresses.txt");

Terminal Output:

Scraping E-Mail Addresses from Boston Globe Website.
Visiting Section:
	Page has (4492) HTMLNodes, and (245) "Anchor" <A> TagNodes.
Visiting Section:metro
	Page has (3435) HTMLNodes, and (143) "Anchor" <A> TagNodes.
	SKIPPING URL: http://www/bostonglobe/com/2018/07/27/farragherxref/6UYUWphAkotvRWmvrOWuJJ/story.html
	SKIPPING URL: http://www/bostonglobe/com/2018/07/27/walkerxref/aGnRxbsEB50lF1kQ5m70JJ/story.html
Visiting Section:business
	Page has (3509) HTMLNodes, and (127) "Anchor" <A> TagNodes.
Visiting Section:news/politics
	Page has (1528) HTMLNodes, and (68) "Anchor" <A> TagNodes.
Visiting Section:lifestyle
	Page has (3548) HTMLNodes, and (159) "Anchor" <A> TagNodes.
Visiting Section:arts
	Page has (3510) HTMLNodes, and (139) "Anchor" <A> TagNodes.