001package Torello.HTML;
002
003import java.io.*;
004import java.util.Vector;
005import java.net.URL;
006import Torello.HTML.HTMLPage.Parser;
007
008import java.util.concurrent.*;
009import java.util.concurrent.locks.*;
010
011import Torello.HTML.Tools.JavaDoc.StaticFunctional;
012import Torello.HTML.Tools.JavaDoc.Excuse;
013
014/**
015 * <CODE>HTML Page Parser (with Max Wait Time Feature) - Documentation.</CODE><BR /><BR />
016 * <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPMWT">
017 * <EMBED CLASS="external-html" DATA-FILE-ID="HTMLP">
018 * @see Scrape#getHTML(BufferedReader, int, int)
019 * @see Scrape#getHTML(BufferedReader, String, String)
020 * @see HTMLPage
021 */
022@StaticFunctional(Excused="parser", Excuses=Excuse.SINGLETON)
023public class HTMLPageMWT
024{
025    private HTMLPageMWT() { }
026
027    /**
028     * If needing to "swap a proprietary parser" comes up, this is possible.
029     * It just needs to accept the same parameters as the current parser, and produce a 
030     * {@code Vector<HTMLNode>.}  This is not an advised step to take, but if an alternative
031     * parser has been tested and happens to be generating different results, it can be easily
032     * 'swapped out' for the one used now.
033     * @see HTMLPage.Parser;
034     * @see HTMLPage.Parser#parse;
035     */
036    public static Parser parser = Torello.HTML.parse.ParserRE::parsePageTokens;
037
038    // ***************************************************************************************************
039    // These 6 functions presume that the HTML source is from a URL
040    // ***************************************************************************************************
041    
042    /**
043     * Parses and Vectorizes HTML from a URL source.
044     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
045     * time-limit is exceeded.
046     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPTO">
047     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPUNIT">
048     * @param url <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPURL">
049     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELIMT">
050     * @return <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRET">
051     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIOEX">
052     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIEX">
053     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPREEX">
054     */
055    public static Vector<HTMLNode> getPageTokens(
056        long timeout, TimeUnit unit,
057        URL url, boolean eliminateHTMLTags
058    )
059        throws IOException, InterruptedException
060    {
061        return getPageTokens(timeout, unit, url, eliminateHTMLTags, null, null, null, null, null);
062    }
063    
064    /**
065     * Parses and Vectorizes HTML from a URL source.
066     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
067     * time-limit is exceeded.
068     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPTO">
069     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPUNIT">
070     * @param url <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPURL">
071     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELIMT">
072     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPET">
073     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPST">
074     * @return <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRET">
075     * @throws ScrapeException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPSCEX2">
076     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIOEX">
077     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIEX">
078     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPREEX">
079     */ 
080    public static Vector<HTMLNode> getPageTokens(
081        long timeout, TimeUnit unit,
082        URL url, boolean eliminateHTMLTags,
083        String startTag, String endTag
084    )
085        throws IOException, InterruptedException
086    {
087        return getPageTokens
088            (timeout, unit, url, eliminateHTMLTags, startTag, endTag, null, null, null);
089    }
090    
091    /**
092     * Parses and Vectorizes HTML from a URL source.
093     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
094     * time-limit is exceeded.
095     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPTO">
096     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPUNIT">
097     * @param url <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPURL">
098     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELIMT">
099     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELN">
100     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPSLN">
101     * @return <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRET">
102     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIAEX">
103     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPSCEX1">
104     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIOEX">
105     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIEX">
106     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPREEX">
107     */
108    public static Vector<HTMLNode> getPageTokens(
109        long timeout, TimeUnit unit,
110        URL url, boolean eliminateHTMLTags,
111        int startLineNum, int endLineNum
112    )
113        throws IOException, InterruptedException
114    {
115        return getPageTokens
116            (timeout, unit, url, eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
117    }
118
119    /**
120     * Parses and Vectorizes HTML from a URL source.
121     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
122     * time-limit is exceeded.
123     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPTO">
124     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPUNIT">
125     * @param url <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPURL">
126     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELIMT">
127     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRAW">
128     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPMATCH">
129     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPJTEXT">
130     * @return <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRET">
131     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIOEX">
132     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIEX">
133     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPREEX">
134     */
135    public static Vector<HTMLNode> getPageTokens(
136        long timeout, TimeUnit unit,
137        URL url, boolean eliminateHTMLTags,
138        String rawHTMLFile, String matchesFile, String justTextFile
139    )
140        throws IOException, InterruptedException
141    {
142        return getPageTokens(
143            timeout, unit, url, eliminateHTMLTags, null, null,
144            rawHTMLFile, matchesFile, justTextFile
145        );
146    }
147
148    // ***************************************************************************************************
149    // The next 6 functions that follow presume that the input is in the form of a Java.util.BufferedReader
150    // ***************************************************************************************************
151
152    /**
153     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
154     * Spawns a <I>monitor-thread</I> that stops the download if a 
155     * certain, user-specified, time-limit is exceeded.
156     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPTO">
157     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPUNIT">
158     * @param br <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPBR">
159     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELIMT">
160     * @return <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRET">
161     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIOEX">
162     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIEX">
163     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPREEX">
164     */
165    public static Vector<HTMLNode> getPageTokens(
166        long timeout, TimeUnit unit,
167        BufferedReader br, boolean eliminateHTMLTags
168    )
169        throws IOException, InterruptedException
170    {
171        return getPageTokens
172            (timeout, unit, br, eliminateHTMLTags, null, null, null, null, null);
173    }
174
175    /**
176     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
177     * Spawns a <I>monitor-thread</I> that stops the download if a 
178     * certain, user-specified, time-limit is exceeded.
179     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPTO">
180     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPUNIT">
181     * @param br <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPBR">
182     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELIMT">
183     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPET">
184     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPST">
185     * @return <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRET">
186     * @throws ScrapeException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPSCEX2">
187     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIOEX">
188     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIEX">
189     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPREEX">
190     */ 
191    public static Vector<HTMLNode> getPageTokens(
192        long timeout, TimeUnit unit,
193        BufferedReader br, boolean eliminateHTMLTags, String startTag, String endTag
194    )
195        throws IOException, InterruptedException
196    {
197        return getPageTokens
198            (timeout, unit, br, eliminateHTMLTags, startTag, endTag, null, null, null);
199    }
200
201    /**
202     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
203     * Spawns a <I>monitor-thread</I> that stops the download if a 
204     * certain, user-specified, time-limit is exceeded.
205     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPTO">
206     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPUNIT">
207     * @param br <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPBR">
208     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELIMT">
209     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELN">
210     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPSLN">
211     * @return <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRET">
212     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIAEX">
213     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPSCEX1">
214     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIOEX">
215     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIEX">
216     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPREEX">
217     */
218    public static Vector<HTMLNode> getPageTokens(
219        long timeout, TimeUnit unit,
220        BufferedReader br, boolean eliminateHTMLTags,
221        int startLineNum, int endLineNum
222    )
223        throws IOException, InterruptedException
224    {
225        return getPageTokens
226            (timeout, unit, br, eliminateHTMLTags, startLineNum, endLineNum, null, null, null);
227    }
228
229    /**
230     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
231     * Spawns a <I>monitor-thread</I> that stops the download if a 
232     * certain, user-specified, time-limit is exceeded.
233     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPTO">
234     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPUNIT">
235     * @param br <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPBR">
236     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELIMT">
237     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRAW">
238     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPMATCH">
239     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPJTEXT">
240     * @return <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRET">
241     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIOEX">
242     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIEX">
243     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPREEX">
244     */
245    public static Vector<HTMLNode> getPageTokens(
246        long timeout, TimeUnit unit,
247        BufferedReader br, boolean eliminateHTMLTags,
248        String rawHTMLFile, String matchesFile, String justTextFile
249    )
250        throws IOException, InterruptedException
251    { 
252        return getPageTokens
253            (timeout, unit, br, eliminateHTMLTags, null, null, rawHTMLFile, matchesFile, justTextFile);
254    }
255
256    // ***************************************************************************************************
257    // * Receives a "pre-instantiated" BufferedReader for the HTML Source parameter
258    // ***************************************************************************************************
259
260    private static final ExecutorService    executor    = Executors.newCachedThreadPool();
261    private static final Lock               lock        = new ReentrantLock();
262
263    /**
264     * If this class has been used to make "multi-threaded" calls that use a Time-Out wait-period,
265     * you might see your Java-Program hang for a few seconds when you would expect it to exit back to your O.S. normally.
266     *
267     * <BR /><BR /><B>Max Wait Time</B> operates by building a "Timeout &amp; Monitor" thread, and therefore when a program you
268     * have written yourself reaches the end of its code, <I><B>if you have performed any Internet-Downloads using
269     * {@code class HTMLPageMWT}</B></I>, then your program <I>might not exit immediately,</I> but rather sit at the
270     * command-prompt for anywhere between 10 and 30 seconds before this Timeout-Thread, created in class HTMLPageMWT, dies.
271     *
272     * <BR /><BR /><B><SPAN STYLE="color: red">MULTI-THREADED:</B></SPAN> You may immediately terminate any additional
273     * threads that were started using this method.
274     */
275    public static void shutdownMWTThreads() { executor.shutdownNow(); }
276
277    /**
278     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
279     * Spawns a <I>monitor-thread</I> that stops the download if a 
280     * certain, user-specified, time-limit is exceeded.
281     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPTO">
282     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPUNIT">
283     * @param br <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPBR">
284     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELIMT">
285     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPET">
286     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPST">
287     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRAW">
288     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPMATCH">
289     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPJTEXT">
290     * @return <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRET">
291     * @throws ScrapeException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPSCEX2">
292     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIOEX">
293     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIEX">
294     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPREEX">
295     */
296    public static Vector<HTMLNode> getPageTokens(
297        long timeout, TimeUnit unit,
298        BufferedReader br, boolean eliminateHTMLTags,
299        String startTag, String endTag,
300        String rawHTMLFile, String matchesFile, String justTextFile
301    )
302        throws IOException, InterruptedException
303    {
304        Callable<Vector<HTMLNode>> threadDownloader = new Callable<Vector<HTMLNode>>()
305        {
306            public Vector<HTMLNode> call() throws Exception
307            {
308                return parser.parse(
309                    Scrape.getHTML(br, startTag, endTag),
310                    eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
311                );
312            }
313        };
314
315        lock.lock();
316        Future<Vector<HTMLNode>> future = executor.submit(threadDownloader);
317        lock.unlock();
318
319        try
320            { return future.get(timeout, unit); }
321        catch (TimeoutException e)
322            { return null; }
323        catch (ExecutionException e)
324        {
325            Throwable originalException = e.getCause();
326            if (originalException == null) throw new RejectedExecutionException(
327                "An Execution Exception was thrown, but it did provide a cause throwable " +
328                "(e.getCause() returned null).  See this exception's getCause() method to " +
329                "view the ExecutionException that has occurred.",
330                e
331            );
332
333            if (originalException instanceof IOException)
334                throw (IOException) originalException;
335
336            if (originalException instanceof RuntimeException)
337                throw (RuntimeException) originalException;
338
339            throw new RejectedExecutionException(
340                "An Execution Exception occurred, but it was neither a RuntimeException, " +
341                "nor IOException.  See this exception's getCause() method to view the " +
342                "underlying error that has occurred.", originalException
343            );
344        }
345    }
346
347    /**
348     * Parses and Vectorizes HTML from a {@code BufferedReader} source.
349     * Spawns a <I>monitor-thread</I> that stops the download if a 
350     * certain, user-specified, time-limit is exceeded.
351     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPTO">
352     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPUNIT">
353     * @param br <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPBR">
354     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELIMT">
355     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELN">
356     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPSLN">
357     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRAW">
358     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPMATCH">
359     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPJTEXT">
360     * @return <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRET">
361     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIAEX">
362     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPSCEX1">
363     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIOEX">
364     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIEX">
365     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPREEX">
366     */
367    public static Vector<HTMLNode> getPageTokens(
368        long timeout, TimeUnit unit,
369        BufferedReader br, boolean eliminateHTMLTags,
370        int startLineNum, int endLineNum,
371        String rawHTMLFile, String matchesFile, String justTextFile
372    )
373        throws IOException, InterruptedException
374    {
375        Callable<Vector<HTMLNode>> threadDownloader = new Callable<Vector<HTMLNode>>()
376        {
377            public Vector<HTMLNode> call() throws Exception
378            {
379                return parser.parse(
380                    Scrape.getHTML(br, startLineNum, endLineNum),
381                    eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
382                );
383            }
384        };
385
386        lock.lock();
387        Future<Vector<HTMLNode>> future = executor.submit(threadDownloader);
388        lock.unlock();
389
390        try
391            { return future.get(timeout, unit); }
392        catch (TimeoutException e)
393            { return null; }
394        catch (ExecutionException e)
395        {
396            Throwable originalException = e.getCause();
397
398            if (originalException == null) throw new RejectedExecutionException(
399                "An Execution Exception was thrown, but it did provide a cause throwable " +
400                "(e.getCause() returned null).  See this exception's getCause() method to " +
401                "view the ExecutionException has that occurred.",
402                e
403            );
404
405            if (originalException instanceof IOException)
406                throw (IOException) originalException;
407
408            if (originalException instanceof RuntimeException)
409                throw (RuntimeException) originalException;
410
411            throw new RejectedExecutionException(
412                "An Execution Exception occurred, but it was neither a RuntimeException, nor " +
413                "IOException.  See this exception's getCause() method to view the underlying " +
414                "error that has occurred.", originalException
415            );
416        }
417    }
418
419    // ***************************************************************************************************
420    // * Receives a java.net.URL for the HTML Source parameter, which could Timeout/Hang - so it must
421    // * be opened within the Multi-Threaded "Timeout" code (and therefore requires a second version of
422    // * these two methods - where Scrape.openConn(url) is *inside* the monitored downloading thread.
423    // ***************************************************************************************************
424    
425    /**
426     * Parses and Vectorizes HTML from a URL source.
427     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
428     * time-limit is exceeded.
429     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPTO">
430     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPUNIT">
431     * @param url <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPURL">
432     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELIMT">
433     * @param startTag <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPET">
434     * @param endTag <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPST">
435     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRAW">
436     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPMATCH">
437     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPJTEXT">
438     * @return <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRET">
439     * @throws ScrapeException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPSCEX2">
440     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIOEX">
441     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIEX">
442     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPREEX">
443     */
444    public static Vector<HTMLNode> getPageTokens(
445        long timeout, TimeUnit unit,
446        URL url, boolean eliminateHTMLTags,
447        String startTag, String endTag,
448        String rawHTMLFile, String matchesFile, String justTextFile
449    )
450        throws IOException, InterruptedException
451    {
452        Callable<Vector<HTMLNode>> threadDownloader = new Callable<Vector<HTMLNode>>()
453        {
454            public Vector<HTMLNode> call() throws Exception
455            { 
456                return parser.parse(
457                    Scrape.getHTML(Scrape.openConn(url), startTag, endTag),
458                    eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
459                );
460            }
461        };
462
463        lock.lock();
464        Future<Vector<HTMLNode>> future = executor.submit(threadDownloader);
465        lock.unlock();
466
467        try
468            { return future.get(timeout, unit); }
469        catch (TimeoutException e)
470            { return null; }
471        catch (ExecutionException e)
472        {
473            Throwable originalException = e.getCause();
474
475            if (originalException == null) throw new RejectedExecutionException(
476                "An Execution Exception was thrown, but it did provide a cause throwable " +
477                "(e.getCause() returned null).  See this exception's getCause() method to " +
478                "view the ExecutionException that has occurred.", e
479            );
480
481            if (originalException instanceof IOException)
482                throw (IOException) originalException;
483
484            if (originalException instanceof RuntimeException)
485                throw (RuntimeException) originalException;
486
487            throw new RejectedExecutionException(
488                "An Execution Exception occurred, but it was neither a RuntimeException, " +
489                "nor IOException.  See this exception's getCause() method to view the " +
490                "underlying error that has occurred.", originalException
491            );
492        }
493    }
494
495    
496    /**
497     * Parses and Vectorizes HTML from a URL source.
498     * Spawns a <I>monitor-thread</I> that stops the download if a certain, user-specified,
499     * time-limit is exceeded.
500     * @param timeout <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPTO">
501     * @param unit <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPUNIT">
502     * @param url <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPURL">
503     * @param eliminateHTMLTags <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELIMT">
504     * @param startLineNum <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPELN">
505     * @param endLineNum <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPSLN">
506     * @param rawHTMLFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRAW">
507     * @param matchesFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPMATCH">
508     * @param justTextFile <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPJTEXT">
509     * @return <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPRET">
510     * @throws IllegalArgumentException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIAEX">
511     * @throws ScrapeException  <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPSCEX1">
512     * @throws IOException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIOEX">
513     * @throws InterruptedException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPIEX">
514     * @throws RejectedExecutionException <EMBED CLASS="external-html" DATA-FILE-ID="HTMLPREEX">
515     */
516    public static Vector<HTMLNode> getPageTokens(
517        long timeout, TimeUnit unit,
518        URL url, boolean eliminateHTMLTags,
519        int startLineNum, int endLineNum,
520        String rawHTMLFile, String matchesFile, String justTextFile
521    )
522        throws IOException, InterruptedException
523    {
524        Callable<Vector<HTMLNode>> threadDownloader = new Callable<Vector<HTMLNode>>()
525        {
526            public Vector<HTMLNode> call() throws Exception
527            { 
528                return parser.parse(
529                    Scrape.getHTML(Scrape.openConn(url), startLineNum, endLineNum),
530                    eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
531                );
532            }
533        };
534
535        lock.lock();
536        Future<Vector<HTMLNode>> future = executor.submit(threadDownloader);
537        lock.unlock();
538
539        try
540            { return future.get(timeout, unit); }
541        catch (TimeoutException e)
542            { return null; }
543        catch (ExecutionException e)
544        {
545            Throwable originalException = e.getCause();
546    
547            if (originalException == null) throw new RejectedExecutionException(
548                "An Execution Exception was thrown, but it did provide a cause throwable " +
549                "(e.getCause() returned null).  See this exception's getCause() method to " +
550                "view the ExecutionException has that occurred.",
551                e
552            );
553
554            if (originalException instanceof IOException)
555                throw (IOException) originalException;
556
557            if (originalException instanceof RuntimeException)
558                throw (RuntimeException) originalException;
559
560            throw new RejectedExecutionException(
561                "An Execution Exception occurred, but it was neither a RuntimeException, nor " +
562                "IOException.  See this exception's getCause() method to view the underlying " +
563                "error that has occurred.", originalException
564            );
565        }
566    }
567}