001package Torello.HTML.Tools.NewsSite;
002
003import Torello.HTML.HTMLNode;
004import Torello.HTML.PageStats;
005import java.util.Vector;
006import java.io.Serializable;
007import java.net.URL;
008
009/**
010 * <CODE>Article - Documentation.</CODE><BR /><BR />
011 * <EMBED CLASS="external-html" DATA-FILE-ID="A">
012 */
013public class Article implements Serializable
014{
015    /** <EMBED CLASS="external-html" DATA-FILE-ID="SVUID">  */
016    protected static final long serialVersionUID = 1;
017
018    /**
019     * This should inform the user that an error occurred when downloading an article. If this
020     * field,  after instantiation is <B>TRUE</B>, all other fields in this class should be thought
021     * of as "irrelevant."
022     */
023    public final boolean                wasErrorDownload;
024
025    /** This is the article's URL from the news website. */
026    public final URL                    url;
027
028    /**
029     * This is the title that was scraped from the main page.  The title is the content of the
030     * {@code <TITLE>...</TITLE>} element on the article HTML page.
031     */
032    public final String                 titleElement;
033
034    /**
035     * This is the original, and complete, HTML vectorized-page download.  It contains the
036     * original, un-modified, article download.
037     */
038    public final Vector<HTMLNode>       originalPage;
039
040    /**
041     * This is the pared down article-body.  It is what is retrieved from {@code class ArticleGet}
042     */
043    public final Vector<HTMLNode>       articleBody;
044
045    /**
046     * The image-URL's that were found in the news-article.  The easiest way to think about this
047     * field is that the following instructions were called on the article-body after downloading
048     * the article:
049     * 
050     * <BR /><BR /><DIV CLASS="SNIP">{@code
051     * Vector<TagNode> imageNodes  = TagNodeGet.all(article, TC.OpeningTags, "img");
052     * Vector<URL>     imageURLs   = Links.resolveSRCs(imageNodes, articleURL);
053     * 
054     * // The results of the above call are stored in this field / Vector<URL>.
055     * }</DIV>
056     */
057    public final Vector<URL>            imageURLs;
058
059    /**
060     * This list contains the "Image Positions" inside the vectorized-article for each image that
061     * was found inside the article.  The easiest way to think about this field is that the
062     * following instructions were called on the article-body after downloading that article:
063     * 
064     * <BR /><BR /><DIV CLASS="SNIP">{@code
065     *  int[] imagePosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
066     * }</DIV>
067     */
068    public final int[]                  imagePosArr;
069
070    /**
071     * This contains an instance of {@code class PageStats} that has been generated out of an
072     * original Newspaper Article Page.
073     * 
074     * <DIV CLASS="LOC">{@code 
075     * this.originalPageStats = new PageStats(originalPage);
076     * }</DIV>
077     */
078    public final PageStats              originalPageStats;
079
080    /**
081     * This contains an instance of {@code class PageStats} that has been generated from the
082     *  post-processed Newspaper Article.
083     * <DIV CLASS="LOC">{@code 
084     * this.processedArticleStats = new PageStats(articleBody);
085     * }</DIV>
086     */
087    public final PageStats              processedArticleStats;
088
089
090    public Article(
091        URL                 url,
092        String              titleElement,
093        Vector<HTMLNode>    originalPage,
094        Vector<HTMLNode>    articleBody,
095        Vector<URL>         imageURLs,
096        int[]               imagePosArr
097    )
098    {
099        this.wasErrorDownload       = false;
100        this.url                    = url;
101        this.titleElement           = titleElement;
102        this.originalPage           = originalPage;
103        this.articleBody            = articleBody;
104        this.imageURLs              = imageURLs;
105        this.imagePosArr            = imagePosArr;
106        this.originalPageStats      = (originalPage == null) ? null : new PageStats(originalPage);
107        this.processedArticleStats  = new PageStats(articleBody);
108    }
109}