package Torello.HTML.Tools.NewsSite;

import Torello.HTML.HTMLNode;
import Torello.HTML.PageStats;
import java.util.Vector;
import java.io.Serializable;
import java.net.URL;

/**
 * <CODE>Article - Documentation.</CODE><BR /><BR />
 * <EMBED CLASS="external-html" DATA-FILE-ID="A">
 */
public class Article implements Serializable
{
    /** <EMBED CLASS="external-html" DATA-FILE-ID="SVUID">  */
    protected static final long serialVersionUID = 1;

    /**
     * Flags a failed article download.  Whenever this field is <B>TRUE</B> after construction,
     * every other field of the instance should be regarded as "irrelevant."
     */
    public final boolean wasErrorDownload;

    /** The {@code URL} of this article on the news website. */
    public final URL url;

    /**
     * The article's title, as scraped from the main page: the text found between
     * {@code <TITLE>...</TITLE>} on the article's HTML page.
     */
    public final String titleElement;

    /**
     * The complete vectorized-HTML page exactly as it was downloaded, with no modifications
     * applied to it.
     */
    public final Vector<HTMLNode> originalPage;

    /**
     * The trimmed-down body of the article; this is the result that {@code class ArticleGet}
     * retrieves.
     */
    public final Vector<HTMLNode> articleBody;

    /**
     * Every image-URL located inside the news-article.  Conceptually, this field holds what the
     * statements below would produce when run against the article-body once the article has
     * been downloaded:
     * 
     * <BR /><BR /><DIV CLASS="SNIP">{@code
     * Vector<TagNode> imageNodes  = TagNodeGet.all(article, TC.OpeningTags, "img");
     * Vector<URL>     imageURLs   = Links.resolveSRCs(imageNodes, articleURL);
     * 
     * // The results of the above call are stored in this field / Vector<URL>.
     * }</DIV>
     */
    public final Vector<URL> imageURLs;

    /**
     * The "Image Positions" - one vector-index, inside the vectorized-article, for each image
     * found in the article.  Conceptually, this field holds what the statement below would
     * produce when run against the article-body once the article has been downloaded:
     * 
     * <BR /><BR /><DIV CLASS="SNIP">{@code
     *  int[] imagePosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
     * }</DIV>
     */
    public final int[] imagePosArr;

    /**
     * A {@code class PageStats} instance built from the original Newspaper Article Page.  This
     * field is {@code null} whenever {@link #originalPage} is {@code null}.
     * 
     * <DIV CLASS="LOC">{@code 
     * this.originalPageStats = new PageStats(originalPage);
     * }</DIV>
     */
    public final PageStats originalPageStats;

    /**
     * A {@code class PageStats} instance built from the post-processed Newspaper Article.
     * 
     * <DIV CLASS="LOC">{@code 
     * this.processedArticleStats = new PageStats(articleBody);
     * }</DIV>
     */
    public final PageStats processedArticleStats;


    /**
     * Builds an {@code Article} record.  Each argument is stored by reference in the field of
     * the same name (no copies are made), {@link #wasErrorDownload} is set to {@code false},
     * and the two {@code PageStats} fields are computed from the pages passed here.
     *
     * @param url           Stored in {@link #url}.
     * @param titleElement  Stored in {@link #titleElement}.
     * @param originalPage  Stored in {@link #originalPage}.  May be {@code null}, in which case
     *                      {@link #originalPageStats} is {@code null} as well.
     * @param articleBody   Stored in {@link #articleBody}, and handed directly to the
     *                      {@code PageStats} constructor.
     * @param imageURLs     Stored in {@link #imageURLs}.
     * @param imagePosArr   Stored in {@link #imagePosArr}.
     */
    public Article(
        URL url, String titleElement, Vector<HTMLNode> originalPage,
        Vector<HTMLNode> articleBody, Vector<URL> imageURLs, int[] imagePosArr
    )
    {
        // This constructor never reports a download error.
        this.wasErrorDownload = false;

        // Plain pass-through values.
        this.url = url;
        this.titleElement = titleElement;
        this.imageURLs = imageURLs;
        this.imagePosArr = imagePosArr;

        // The original page is optional: statistics are only built when it was provided.
        this.originalPage = originalPage;

        if (originalPage == null)
            this.originalPageStats = null;
        else
            this.originalPageStats = new PageStats(originalPage);

        // NOTE(review): unlike 'originalPage', 'articleBody' reaches PageStats without a
        // null-check - confirm whether a null article-body is ever expected here.
        this.articleBody = articleBody;
        this.processedArticleStats = new PageStats(articleBody);
    }
}