001package Torello.HTML.Tools.NewsSite;
002
003import Torello.HTML.*;
004import Torello.HTML.NodeSearch.*;
005import Torello.Java.*;
006
007import Torello.HTML.Tools.Images.ImageScraper;
008import Torello.HTML.Tools.Images.ImageScraper.AdditionalParameters;
009import Torello.Java.FileNode.RetTypeChoice;
010import Torello.Java.Additional.Ret4;
011import Torello.Java.Shell.C;
012
013import java.util.*;
014import java.io.*;
015
016import java.net.URL;
017import java.util.concurrent.TimeUnit;
018
019/**
020 * <CODE>ScrapeArticles - Documentation.</CODE><BR /><BR />
021 * <EMBED CLASS="external-html" DATA-FILE-ID="SA">
022 */
023@Torello.HTML.Tools.JavaDoc.StaticFunctional
024public class ScrapeArticles
025{
026    private ScrapeArticles() { }
027
028    /**
029     * This is used to do the downloading of newspaper articles.
030     *
031     * @param articleReceiver This is an instance of {@code ScrapedArticleReceiver}.  Whenever an
032     * {@code Article} has successfully downloaded, it will be passed to this 'receiver' class.  
033     * There is a pre-written, standard {@code ScrapedArticleReceiver} that writes to a directory
034     * on the file-system as {@code Article's} are downloaded.  If there is a need to transmit
035     * downloaded {@code Article's} elsewhere, implement that  {@code interface}, and provide an
036     * instance of it to this parameter.
037     *
038     * @param articleURLs this is a parameter that should have been generated by a call to method:
039     * {@code ScrapeURLs.getArticleURLs(...)}
040     *
041     * @param articleGetter This is basically a "Post-Processor" for HTML Web-based newspaper 
042     * articles. This parameter cannot be null.  It is just a simple, one-line, lambda-predicate
043     * which needs to be implemented by the programmer.  Internet news websites (such as: 
044     * {@code news.yahoo.com, cnn.com}, and {@code gov.cn}) have News-Articles on pages that 
045     * contain a lot of extraneous and advertising links and content.  This parameter needs to
046     * extract the {@code Article}-body content from the rest of the page.  <I>This is usually 
047     * very trivial, but it is also mandatory.</I>  Read about the  {@code class ArticleGet} for
048     * more information about extracting the news-content from a Newspaper {@code Article}
049     * web-page.
050     *
051     * @param skipArticlesWithoutPhotos This may be <B>TRUE</B>, and if it is - articles that
052     * contain only textual content will be skipped.  This can be useful for foreign-news sources
053     * where the reader is usually working-harder to understand the content in the first place.
054     * This class is primarily used with foreign-news content websites.  As such, staring at 
055     * pages of Mandarin Chinese or Spanish is usually a lot easier if there is at least one 
056     * photo on the page.  This parameter allows users to skip highly dense articles that do not
057     * contain at least one picture.
058     *
059     * @param bannerAndAdFinder This parameter may be null, but if it is not, it will be used to
060     * skip banner-advertisement images.  This parameter, in reality, does very little.  It
061     * will not actually be used to eliminated advertising images - <I>but rather only to identify
062     * when an image is a banner, advertisement, or spurious picture</I>.  Since this is a news
063     * web-site scraping Java Package, there is a part that allows a user to require that only news
064     * paper articles that contain a photo be downloaded - and the real purpose of including the
065     * {@code 'bannerAndAdFinder'} is to allow the scrape mechanism to 'skip' articles whose only
066     * photos are advertisements.
067     * 
068     * <BR /><BR /><B>NOTE:</B> Again, the primary impetus for developing these tools was for 
069     * scraping and translating news articles from foreign countries like Spain, China, and parts
070     * of South America.  It could be used for any news-source desired.  When reading foreign
071     * language text - it helps "a little bit more" to see a picture.  This parameter is solely 
072     * used for that purpose.
073     * 
074     * <BR /><BR /><B>PRODUCT ADVERTISEMENTS &amp; FACEBOOK / TWITTER LINKS:</B> Removing actual
075     * links about "pinning to Reddit.com" or "Tweeting" articles can be done using either:
076     *
077     * <BR /><BR /><UL CLASS=JDUL>
078     * <LI> {@link ArticleGet} - Writing an instance of {@code ArticleGet} that <B>NOT ONLY</B> 
079     *      extracts the body of a newspaper-article, <B>BUT ALSO</B> performs HTML cleanup using
080     *      the {@code 'Remove'} method of the NodeSearch Package.
081     * </LI>
082     * <LI> {@link HTMLModifier} - Writing a "cleaner" version of the {@code HTMLModifier} lambda
083     *      expression / {@code Function Interface} can also use the NodeSearch classes for
084     *      removing annoying commercials - or buttons about "Sharing a link on Facebook."  The
085     *      class {@link ToHTML} provides a window for accepting an instance of
086     *      {@code HTMLModifier} when converting the generated serialized-data HTML
087     *      {@code Vector's} into {@code '.html' index} files.
088     * </LI>
089     * </UL>
090     *
091     * @param keepOriginalPageHTML When this is <B>TRUE</B>, the original page html will be stored
092     * in the result set.  When this is <B>FALSE</B> null shall be stored in place of the original
093     * page data.
094     *
095     * <BR /><BR /><B>NOTE:</B> The original page HTML is the source HTML that is fed into the
096     * {@code ArticleGet} lambda.  It contains the "pre-processed HTML."
097     *
098     * @param pause If there are many / numerous articles to download, pass an instance of
099     * {@code class Pause}, and intermediate progress can be saved, and reloaded at a later time.
100     *
101     * @param log This parameter may not be null, or a {@code NullPointerException} shall throw.
102     * As articles are downloaded, notices shall be posted to this {@code 'log'} by this method.
103     * <EMBED CLASS="external-html" DATA-FILE-ID="APPENDABLE">
104     *
105     * @return A {@code Vector} that is <B><I>exactly</B></I> parallel to the input
106     * {@code Vector<Vector<String>> articleURLs} will be returned.  Each element of each of the
107     * sub-{@code Vector's} in this two-dimensional {@code Vector} will have an instance of the
108     * enumerated-type {@code 'DownloadResult'}.  The constant-value in {@code 'DownloadResult'}
109     * will identify whether or not the {@code Article} pointed to by the {@code URL} at that
110     * {@code Vector}-location successfully downloaded.
111     * <BR /><BR />If the download failed, then the value of the {@code enum 'DownloadResult'} 
112     * will be able to identify the error that occurred when attempting to scrape a particular
113     * news-story {@code URL} 
114     *
115     * @throws PauseException If there is an error when attempting to save the download state.
116     *
117     * @throws ReceiveException If there are any problems with the {@code ScrapedArticleReceiver}
118     * <BR /><BR /><B>NOTE:</B> A {@code ReceiveException} implies that the user's code has failed
119     * to properly handle or save an instance of {@code Article} that has downloaded, successfully,
120     * by this {@code class ScrapeArticles}.  A {@code ReceiveException} will halt the download
121     * process immediately, and download state will be saved if the user has provided a reference
122     * to the {@code Pause} parameter.
123     *
124     * <BR /><BR /><B>NOTE:</B> Other internally caused download-exceptions will be handled and
125     * logged (<I>without halting the entire download-process</I>) - and downloading will continue.  
126     * A note about the internally-produced exception will be printed to the log, and an 
127     * appropriate instance of {@code enum DownloadResult} will be put in the return
128     * {@code Vector}.
129     *
130     * @throws IOException This exception is required for any method that uses Java's
131     * {@code interface java.lang.Appendable}.  Here, the {@code 'Appendable'} is the log, and if
132     * writing to this user provided {@code 'log'} produces an exception, then download progress
133     * will halt immediately, and download state will be saved if the user has provided a reference
134     * to the {@code Pause} parameter.
135     */
136    public static Vector<Vector<DownloadResult>> download(   
137        ScrapedArticleReceiver  articleReceiver,
138        Vector<Vector<String>>  articleURLs,
139        ArticleGet              articleGetter,
140        boolean                 skipArticlesWithoutPhotos,
141        StrFilter               bannerAndAdFinder,   
142        boolean                 keepOriginalPageHTML,
143        Pause                   pause,
144        Appendable              log
145    )
146        throws PauseException, ReceiveException, IOException
147    {
148        log.append(
149            "\n" + C.BRED +
150            "*****************************************************************************************\n" +
151            "*****************************************************************************************\n" + 
152            C.RESET + " Downloading Articles" + C.BRED + "\n" +
153            "*****************************************************************************************\n" +
154            "*****************************************************************************************\n" + 
155            C.RESET + '\n'
156        );
157
158        // The loop variables, and the return-result Vector.
159        int                             outerCounter    = 0;
160        int                             innerCounter    = 0;
161        int                             successCounter  = 0;
162        boolean                         firstIteration  = true;
163        Vector<Vector<DownloadResult>>  ret             = null;
164        URL                             url             = null;
165        Runtime                         rt              = Runtime.getRuntime();
166
167        // If the user has passed an instance of 'pause' then it should be loaded from disk.
168        if (pause != null)
169        {
170            Ret4<Vector<Vector<DownloadResult>>, Integer, Integer, Integer> r = pause.loadState();
171
172            ret             = r.a;
173            outerCounter    = r.b.intValue();
174            innerCounter    = r.c.intValue();
175            successCounter  = r.d.intValue();
176        }
177
178        // If the user did not provide a "Pause" mechanism, **OR** the "Pause Mechanism" asserts
179        // that the download process is starting from the beginning of the article-URL Vector,
180        // THEN a *new vector* should be built.
181        if ((pause == null) || ((outerCounter == 0) && (innerCounter == 0) && (successCounter == 0)))
182        {
183            // Need to instantiate a brand new return vector.  The downloader is starting over
184            // at the beginning of the Article URL list.
185            ret = new Vector<>(articleURLs.size());
186
187            // Initializes the capacity (sizes) of the two-dimensional "Return Vector."
188            // NOTE: The return Vector is exactly parallel to the input "articleURLs" two-dimensional
189            //       input Vector.
190            for (int i=0; i < articleURLs.size(); i++) 
191                ret.add(new Vector<DownloadResult>(articleURLs.elementAt(i).size()));
192        }
193
194        for (; outerCounter < articleURLs.size(); outerCounter++)
195        {
196            // System.out.println("outerCounter=" + outerCounter + ", innerCounter=" + innerCounter + ", articleURLs.size()=" + articleURLs.size());
197            // System.out.println("articleURLs.elementAt(" + outerCounter + ").size()=" + articleURLs.elementAt(outerCounter).size());
198            for (
199                    innerCounter = (firstIteration ? innerCounter : 0);
200                    innerCounter < articleURLs.elementAt(outerCounter).size();
201                    innerCounter++
202                )
203                try {
204                    firstIteration = false;
205                    String urlStr = articleURLs.elementAt(outerCounter).elementAt(innerCounter);
206
207                    // *******************************************************************************
208                    // Instantiate the URL object from the URLStr String.
209                    // *******************************************************************************
210                    // Should never happen, because each URL will have been tested / instantiated in the previous method.
211                    try
212                        { url = new URL(urlStr); }
213                    catch (Exception e)
214                    {
215                        log.append("Could not instantiate URL-String into URL for [" + urlStr + "].\n");
216                        ret.elementAt(outerCounter).add(DownloadResult.BAD_ARTICLE_URL);
217                        continue;
218                    }
219
220                    // *******************************************************************************
221                    // Run the Garbage Collector, Print Article URL and Number to log.
222                    // *******************************************************************************
223                    rt.gc();
224                    String              freeMem         = StringParse.commas(rt.freeMemory());
225                    String              totalMem        = StringParse.commas(rt.totalMemory());
226
227                    log.append(
228                        "\nVisiting URL: [" +
229                        C.YELLOW +  StringParse.zeroPad10e4(outerCounter) + C.RESET + 
230                        " of " + StringParse.zeroPad10e4(articleURLs.size()) + ", " +
231                        C.YELLOW +  StringParse.zeroPad10e4(innerCounter) + C.RESET + 
232                        " of " + StringParse.zeroPad10e4(articleURLs.elementAt(outerCounter).size()) + "] " +
233                        C.CYAN         + " - "  + url                       + C.RESET + '\n' +
234                        "Available Memory: "    + C.YELLOW +  freeMem       + C.RESET + '\t' +
235                        "Total Memory: "        + C.YELLOW +  totalMem      + C.RESET + '\n'
236                    );
237
238
239                    // *******************************************************************************
240                    // Scrape the web-page
241                    // *******************************************************************************
242                    int                 retryCount      = 0;
243                    Vector<HTMLNode>    page            = null;
244                    while ((page == null) && (retryCount < 5))
245                    try
246                        { page = HTMLPageMWT.getPageTokens(15, TimeUnit.SECONDS, url, false); }
247                    catch (Exception e)
248                        { log.append(HTTPCodes.convertMessageVerbose(e, url, 1) + '\n'); retryCount++; }
249
250
251                    // *******************************************************************************
252                    // Verify the results of scraping the web-page
253                    // *******************************************************************************
254                    if (page == null)
255                    {
256                        log.append(C.BRED + "\tArticle could not download, max 5 retry counts." + C.RESET + '\n');
257                        ret.elementAt(outerCounter).add(DownloadResult.COULD_NOT_DOWNLOAD);
258                        continue;
259                    }
260
261                    if (page.size() == 0)
262                    {
263                        log.append(C.BRED + "\tArticle was retrieved, but page-vector was empty" + C.RESET + '\n');
264                        ret.elementAt(outerCounter).add(DownloadResult.EMPTY_PAGE_VECTOR);
265                        continue;
266                    }
267
268                    log.append("\tPage contains (" + C.YELLOW + page.size() + C.RESET + ") HTMLNodes.\n");
269
270
271                    // *******************************************************************************
272                    // Retrieve the HTML <TITLE> element from the page - if it has one.
273                    // *******************************************************************************
274                    String title = Util.textNodesString(TagNodeGetInclusive.first(page, "title"));
275                    if (title.length() > 0) log.append("\tPage <TITLE> element is: " + C.YELLOW + title + C.RESET + '\n');
276                    else                    log.append("\tPage has no <TITLE> element, or it was empty.\n");
277
278
279                    // *******************************************************************************
280                    // Use the Article Getter to get it, make sure to watch for exceptions.
281                    // *******************************************************************************
282                    Vector<HTMLNode> article = null;
283                    try
284                        { article = articleGetter.apply(url, page); }
285                    catch (ArticleGetException e)
286                    {
287                        log.append(
288                            C.BRED + "\tArticleGet.apply(...) failed: " + e.getMessage() + C.RESET +
289                            "\nException Cause Chain:\n" + EXCC.toString(e) + '\n'
290                        );
291                        ret.elementAt(outerCounter).add(DownloadResult.ARTICLE_GET_EXCEPTION);
292                        continue;
293                    }
294
295
296                    // *******************************************************************************
297                    // Verify the results of article get
298                    // *******************************************************************************
299                    if (article == null)
300                    {
301                        log.append(C.BRED + "\tContent-body not found by ArticleGet.apply(...)\n" + C.RESET);
302                        ret.elementAt(outerCounter).add(DownloadResult.ARTICLE_GET_RETURNED_NULL);
303                        continue;
304                    }
305
306                    if (article.size() == 0)
307                    {
308                        log.append(C.BRED + "\tContent-body not found by ArticleGet.apply(...)\n" + C.RESET);
309                        ret.elementAt(outerCounter).add(DownloadResult.ARTICLE_GET_RETURNED_EMPTY_VECTOR);
310                        continue;
311                    }
312
313                    log.append("\tArticle body contains (" + C.YELLOW + article.size() + C.RESET + ") HTMLNodes.\n");
314
315
316                    // *******************************************************************************
317                    // Retrieve the positions of the images
318                    // *******************************************************************************
319                    int[]               imagePosArr     = InnerTagFind.all(article, "img", "src",
320                                                            (String src) -> ! StrCmpr.startsWithXOR_CI(src.trim(), "data:"));
321                    Vector<URL>         imageURLs       = Links.resolveSRCs(article, imagePosArr, url);
322
323                    if (skipArticlesWithoutPhotos && (imageURLs.size() == 0))
324                    {
325                        log.append(C.BRED + "\tArticle content contained 0 HTML IMG elements" + C.RESET + '\n');
326                        ret.elementAt(outerCounter).add(DownloadResult.NO_IMAGES_FOUND);
327                        continue;
328                    }
329
330                    log.append("\tArticle contains (" + C.YELLOW + imageURLs.size() + C.RESET + ") image TagNodes.\n");
331
332
333                    // *******************************************************************************
334                    // Check the banner-situation.  Count all images, and less that number by the number of "banner-images"
335                    // *******************************************************************************
336                    int imageCount = imageURLs.size();
337                    if (bannerAndAdFinder != null)
338                        for (int pos : imagePosArr)
339                            if (bannerAndAdFinder.test(((TagNode) article.elementAt(pos)).AV("src")))
340                                imageCount--;
341
342                    if (skipArticlesWithoutPhotos && (imageCount == 0))
343                    {
344                        log.append(C.BRED + "\tAll images inside article were banner images" + C.RESET + '\n');
345                        ret.elementAt(outerCounter).add(DownloadResult.NO_IMAGES_FOUND_ONLY_BANNERS);
346                        continue;
347                    }
348
349                    if (bannerAndAdFinder != null)
350                        log.append("\tArticle contains (" + C.YELLOW + imageCount + C.RESET + ") non-banner image TagNodes.\n");
351
352
353                    // *******************************************************************************
354                    // Write the results to the output file
355                    // *******************************************************************************
356                    Article articleResult = new Article
357                        (url, title, (keepOriginalPageHTML ? page : null), article, imageURLs, imagePosArr);
358
359                    // The article was successfully downloaded and parsed.  Send it to the "Receiver" and
360                    // add DownloadResult to the return vector.
361                    log.append(C.GREEN + "ARTICLE LOADED." + C.RESET + "  Sending to ScrapedArticleReceiver.\n");
362                    articleReceiver.receive(articleResult, outerCounter, innerCounter);
363                    ret.elementAt(outerCounter).add(DownloadResult.SUCCESS);
364
365                    successCounter++;
366
367                }
368                catch (ReceiveException re)
369                {
370                    // NOTE: If there was a "ReceiveException" then article-downloading must be halted
371                    // immediately.  A ReceiveException implies that the user did not properly handle
372                    // the downloaded Article, and the user's code would have to be debugged.
373                    log.append(
374                        "There was an error when attempting to pass the downloaded article to the " +
375                        "ArticleReceiver.  Unrecoverable.  Saving download state, and halting download.\n"
376                    );
377
378                    // Make sure to save the internal download state                        
379                    if (pause != null) pause.saveState(ret, outerCounter, innerCounter, successCounter);
380
381                    // Make sure to stop the download process now.  If the article "Receiver" failed to
382                    // save or store a received-article, there is NO POINT IN CONTINUING THE DOWNLOADER.
383                    // NOTE: This will cause the method to exit with error, make sure to stop the "MWT Thread"
384                    //       Remember, this is just a simple "Monitor Thread" that prevents a download
385                    //       from hanging.
386                    HTMLPageMWT.shutdownMWTThreads();
387
388                    throw re;
389                }
390                catch (IOException ioe)
391                {
392                    // This exception occurs if writing the "Appendable" (the log) fails.  If this
393                    // happens, download should halt immediately, and the internal-state should be
394                    // saved to the 'pause' variable.
395                    if (pause != null) pause.saveState(ret, outerCounter, innerCounter, successCounter);
396
397                    // Need to stop the download process.  IOException could ONLY BE the result of the
398                    // "Appendable.append" method.  None of the other commands throw IOException.
399                    // ALSO: If the "Appendable log" never fails (which is 99% likely not to happen),
400                    // This catch-statement will never actually execute.  However, if Appendable.append
401                    // did, in fact, fail - then downloading cannot continue;
402                    // NOTE: This will cause the method to exit with error, make sure to stop the "MWT Thread"
403                    //       Remember, this is just a simple "Monitor Thread" that prevents a download from
404                    //       hanging.
405                    HTMLPageMWT.shutdownMWTThreads();
406
407                    throw ioe;
408                }
409                catch (Exception e)
410                {
411                    // *******************************************************************************
412                    // Handle "Unknown Exception" case.
413                    // *******************************************************************************  
414                    log.append(
415                        "There was an unknown Exception:\n" + EXCC.toString(e) + "\nSkipping URL: " +
416                        url + '\n'
417                    );
418
419                    ret.elementAt(outerCounter).add(DownloadResult.UNKNOWN_EXCEPTION);
420                }
421                finally
422                {
423                    // *******************************************************************************
424                    // Write the current "READ STATE" information (two integers)
425                    // *******************************************************************************                         
426                    if (pause != null) pause.saveState(ret, outerCounter, innerCounter, successCounter);
427                }
428        }
429
430        log.append(
431            C.BRED + "*****************************************************************************************\n" + C.RESET +
432            "Traversing Site Completed.\n" +
433            "Loaded a total of (" + successCounter + ") articles.\n"
434        );
435
436        // Returns the two-dimensional "Download Result" Vector
437        // Make sure to stop the "Max Wait Time Threads"
438        HTMLPageMWT.shutdownMWTThreads();
439
440        return ret;
441    }
442
443}