View Javadoc

1   /*
2    * HtmlDocument.java -- classes to represent HTML documents as parse trees.
3    * Copyright (C) 1999 Quiotix Corporation.  
4    *
5    * This program is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License, version 2, as 
7    * published by the Free Software Foundation.  
8    *
9    * This program is distributed in the hope that it will be useful,
10   * but WITHOUT ANY WARRANTY; without even the implied warranty of
11   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
13   * for more details.
14   */
15  
16  package com.quiotix.html.parser;
17  
18  import java.util.ArrayList;
19  import java.util.Iterator;
20  import java.util.List;
21  
22  /**
23   * Represents an HTML document as a sequence of elements.  The defined
24   * element types are: Tag, EndTag, TagBlock (matched tag..end tag, with the
25   * intervening elements), Comment, Text, Newline, and Annotation.
26   * <p>
27   * The various element types are defined as nested classes within
28   * HtmlDocument.
29   * </p>
30   * @author Brian Goetz, Quiotix
31   * @see com.quiotix.html.parser.HtmlVisitor
32   */
33  
34  public class HtmlDocument implements Visitable {
35      ElementSequence elements;
36  
37      /** Constructor. */
38      public HtmlDocument(ElementSequence s) {
39          elements = s;
40      }
41  
42      public void accept(HtmlVisitor v) {
43          v.visit(this);
44      }
45  
46      private static String dequote(String s) {
47          if (s == null)
48              return "";
49          if ((s.startsWith("\"") && s.endsWith("\"")) || 
50              (s.startsWith("'") && s.endsWith("'")))
51              return s.substring(1, s.length()-1);
52          else
53              return s;
54      }
55  
    // The various elements of the HtmlDocument (Tag, EndTag, etc.) are defined
    // as nested classes largely for reasons of namespace control.
    // The following subclasses of HtmlElement exist: Tag, EndTag, Text, Comment,
    // Newline, Annotation, TagBlock.  The additional classes
    // ElementSequence, Attribute, and AttributeList are defined here as well.
61  
62      // Each subclass of HtmlElement should have a visit() method in the
63      // HtmlVisitor class.
64  
65      /**
66       * Abstract class for HTML elements.  Enforces support for Visitors.
67       */
68      public static abstract class HtmlElement implements Visitable, Sized {
69          public abstract void accept(HtmlVisitor v);
70      }
71  
72      /**
73       * HTML start tag.  Stores the tag name and a list of tag attributes.
74       */
75      public static class Tag extends HtmlElement {
76          /** The name of the tag. */
77          public String tagName;
78          /** A List of the tags Attributes. */
79          public AttributeList attributeList;
80  
81          /** 
82           * Whether the tag has an empty content model  
83           * eg the BR and HR tags.
84           */
85          public boolean emptyTag = false;
86  
87          /** Constructor. */
88          public Tag(String t, AttributeList a) {
89              tagName = t;
90              attributeList = a;
91          }
92  
93          /** Set Tag type to Empty. */
94          public void setEmpty(boolean b) {
95              emptyTag = b;
96          }
97  
98          public void accept(HtmlVisitor v) {
99              v.visit(this);
100         }
101 
102         /** Whether Tag has an Attribute with given name. */
103         public boolean hasAttribute(String name) {
104             return attributeList.contains(name);
105         }
106 
107         /** 
108          * Whether Tag has an Attribute with given name 
109          * and that Attribute has a non-null value. 
110          */
111         public boolean hasAttributeValue(String name) {
112             return attributeList.hasValue(name);
113         }
114 
115         /**
116          * @return the value of the Attribute with the given name or null
117          */
118         public String getAttributeValue(String name) {
119             return attributeList.getValue(name);
120         }
121 
122         public int getLength() {
123             int length = 0;
124             for (Iterator iterator = attributeList.attributes.iterator(); iterator.hasNext();) {
125                 Attribute attribute = (Attribute) iterator.next();
126                 length += 1 + (attribute.getLength());
127             }
128             return length + tagName.length() + 2 + (emptyTag ? 1 : 0);
129         }
130 
131         public String toString() {
132             StringBuffer s = new StringBuffer();
133             s.append("<");
134             s.append(tagName);
135             for (Iterator iterator = attributeList.attributes.iterator(); iterator.hasNext();) {
136                 Attribute attribute = (Attribute) iterator.next();
137                 s.append(" ");
138                 s.append(attribute.toString());
139             }
140             if (emptyTag) s.append("/");
141             s.append(">");
142             return s.toString();
143         }
144     }
145 
146     /**
147      * Html end tag.  Stores only the tag name.
148      */
149     public static class EndTag extends HtmlElement {
150 
151         /** The name of the Tag. */
152         public String tagName;
153 
154         /** Constructor. */
155         public EndTag(String t) {
156             tagName = t;
157         }
158 
159         public void accept(HtmlVisitor v) {
160             v.visit(this);
161         }
162 
163         public int getLength() {
164             return 3 + tagName.length();
165         }
166 
167         public String toString() {
168             return "</" + tagName + ">";
169         }
170     }
171 
172     /**
173      * A tag block is a composite structure consisting of a start tag
174      * a sequence of HTML elements, and a matching end tag.
175      */
176     public static class TagBlock extends HtmlElement {
177         /** Tag at start of Block.*/
178         public Tag startTag;
179         /** Tag at end of Block.*/
180         public EndTag endTag;
181         /** The sequance of elements which make up the body.*/
182         public ElementSequence body;
183 
184         /** Constructor. */
185         public TagBlock(String name, AttributeList aList, ElementSequence b) {
186             startTag = new Tag(name, aList);
187             endTag = new EndTag(name);
188             body = b;
189         }
190 
191         public void accept(HtmlVisitor v) {
192             v.visit(this);
193         }
194         
195         public int getLength() { 
196             int bodyLength = 0;
197             for (Iterator iterator = body.iterator(); iterator.hasNext();) {
198                 HtmlDocument.HtmlElement htmlElement = (HtmlDocument.HtmlElement) iterator.next();
199                 bodyLength += htmlElement.getLength();    
200             }
201             return startTag.getLength() + bodyLength + endTag.getLength();
202         }
203         
204         public String toString() {
205           StringBuffer sb = new StringBuffer();
206           sb.append(startTag.toString());
207           for (Iterator iterator = body.iterator(); iterator.hasNext();) {
208             HtmlDocument.HtmlElement htmlElement = (HtmlDocument.HtmlElement) iterator.next();
209             sb.append(htmlElement.toString());
210           }
211           sb.append(endTag.toString());
212           return sb.toString();
213         }
214         
215         /**
216          * @return the text within a tag block
217          */
218         public String text() {
219           StringBuffer sb = new StringBuffer();
220           for (Iterator iterator = body.iterator(); iterator.hasNext();) {
221             HtmlDocument.HtmlElement htmlElement = (HtmlDocument.HtmlElement) iterator.next();
222             if (htmlElement instanceof Text) {
223               sb.append(htmlElement.toString());
224             } else if(htmlElement instanceof TagBlock)
225               sb.append(((TagBlock)htmlElement).text());
226           }
227           return sb.toString();
228         }
229     }
230 
231     /**
232      * HTML comments.
233      */
234     public static class Comment extends HtmlElement {
235         /**
236          * Note that a Comment starts and ends with two hyphen characters. 
237          */
238         public String comment;
239 
240         /** Constructor. */
241         public Comment(String c) {
242             comment = c;
243         }
244 
245         public void accept(HtmlVisitor v) {
246             v.visit(this);
247         }
248 
249         public int getLength() {
250             return 3 + comment.length();
251         }
252 
253         public String toString() {
254             return "<!" + comment + ">";
255         }
256     }
257 
258     /**
259      * Plain text
260      */
261     public static class Text extends HtmlElement {
262         /** The text. */
263         public String text;
264 
265         /** Constructor. */
266         public Text(String t) {
267             text = t;
268         }
269 
270         public void accept(HtmlVisitor v) {
271             v.visit(this);
272         }
273 
274         public int getLength() {
275             return text.length();
276         }
277 
278         public String toString() {
279             return text;
280         }
281     }
282 
283     /**
284      * End of line indicator.
285      */
286     public static class Newline extends HtmlElement {
287         /** The system specific newline String. */
288         public static final String NL = System.getProperty("line.separator");
289 
290         public void accept(HtmlVisitor v) {
291             v.visit(this);
292         }
293 
294         public int getLength() {
295             return NL.length();
296         }
297 
298         public String toString() {
299             return NL;
300         }
301     }
302 
303     /**
304      * A sequence of HTML elements.
305      */
306     public static class ElementSequence {
307         private List elements;
308 
309         /** Constructor. */
310         public ElementSequence(int n) {
311             elements = new ArrayList(n);
312         }
313 
314         /** Constructor. */
315         public ElementSequence() {
316             elements = new ArrayList();
317         }
318 
319         /** Add element to list. */
320         public void addElement(HtmlElement o) {
321             elements.add(o);
322         }
323 
324         /**
325          * @return the number of elements in this list.
326          */
327         public int size() {
328             return elements.size();
329         }
330 
331         /**
332          * @return an iterator over the elements in this list in proper sequence.
333          */
334         public Iterator iterator() {
335             return elements.iterator();
336         }
337 
338         /**
339          * Clear current elements and replace with given Collection.
340          * 
341          * @param collection to replace elements with
342          */
343         public void setElements(List collection) {
344             elements.clear();
345             elements.addAll(collection);
346         }
347     }
348 
349     /**
350      * Annotations.  These are not part of the HTML document, but
351      * provide a way for HTML-processing applications to insert
352      * annotations into the document.  These annotations can be used by
353      * other programs or can be brought to the user's attention at a
354      * later time.  For example, the HtmlCollector might insert an
355      * annotation to indicate that there is no corresponding start tag
356      * for an end tag.
357      */
358     public static class Annotation extends HtmlElement {
359         String type, text;
360 
361         /** Constructor. */
362         public Annotation(String type, String text) {
363             this.type = type;
364             this.text = text;
365         }
366 
367         public void accept(HtmlVisitor v) {
368             v.visit(this);
369         }
370 
371         public int getLength() {
372             return 14 + type.length() + text.length();
373         }
374 
375         public String toString() {
376             return "<!--NOTE(" + type + ") " + text + "-->";
377         }
378     }
379 
380     /**
381      * A Tag Attribute.
382      */
383     public static class Attribute implements Sized {
384         /** The name of this Attribute. */
385         public String name;
386         /** The value of this Attribute, including any surrounding quotes. */
387         public String value;
388         /** Whether the Attribute has a value. */
389         public boolean hasValue;
390 
391         /** Constructor. */
392         public Attribute(String n) {
393             name = n;
394             hasValue = false;
395         }
396 
397         /** Constructor. */
398         public Attribute(String n, String v) {
399             name = n;
400             if (v != null) {
401                 value = v;
402                 hasValue = true;
403             }
404         }
405 
406         /** 
407          * Whether quotes are included is dependant upon the source document.
408          * 
409          * {@inheritDoc}
410          * @see com.quiotix.html.parser.Sized#getLength()
411          */
412         public int getLength() {
413             return (hasValue ? name.length() + 1 + value.length() : name.length());
414         }
415 
416         public String toString() {
417             return (hasValue ? name + "=" + value : name);
418         }
419         
420         /**
421          * @return the value with quotes removed
422          */
423         public String getValue() { 
424             return dequote(value);
425         }
426         
427         /**
428          * @param v the value to set, may be null
429          */
430         public void setValue(String v) {
431             value = v;
432             if (v == null)  
433                 hasValue = false;
434             else 
435                 hasValue = true;
436         }
437     }
438 
439     /**
440      * A List of Attributes.
441      */
442     public static class AttributeList {
443         /** The backing List. */
444         public List attributes = new ArrayList();
445 
446         /** Add. */
447         public void addAttribute(Attribute a) {
448             attributes.add(a);
449         }
450 
451         /** Whether the List contains an Attribute with the given name. */
452         public boolean contains(String name) {
453             for (Iterator iterator = attributes.iterator(); iterator.hasNext();) {
454                 Attribute attribute = (Attribute) iterator.next();
455                 if (attribute.name.equalsIgnoreCase(name))
456                     return true;
457             }
458             return false;
459         }
460 
461         /** 
462          * Whether the List contains an Attribute with the given name 
463          * and that Attribute has a non-null value. 
464          */
465         public boolean hasValue(String name) {
466             for (Iterator iterator = attributes.iterator(); iterator.hasNext();) {
467                 Attribute attribute = (Attribute) iterator.next();
468                 if (attribute.name.equalsIgnoreCase(name) && attribute.hasValue)
469                     return true;
470             }
471             return false;
472         }
473 
474         /**
475          * @param name the name of the Attribute
476          * @return the value of the Attribute with the given name or null
477          */
478         public String getValue(String name) {
479             for (Iterator iterator = attributes.iterator(); iterator.hasNext();) {
480                 Attribute attribute = (Attribute) iterator.next();
481                 if (attribute.name.equalsIgnoreCase(name) && attribute.hasValue)
482                     return dequote(attribute.value);
483             }
484             return null;
485         }
486     }
487 }
488 
489 
490