Parser.java

/* dIntProg Browser. A webbrowser written in Java.
 * Copyright (C) 2001 Martin Geisler <gimpster@gimpster.com>
 *  
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the
 * 
 * Free Software Foundation, Inc.,
 * 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

import java.net.*;
import java.io.*;
import java.util.*;
import java.awt.*;

/** A parser for HTML. */
public class Parser {

    /* Data: the stack of boxes that are currently open. beginBody()
     * pushes the outermost box, and each begin...() method pushes a box
     * that the matching end...() method pops again. */
    private static Stack data = new Stack();
    /* Fonts: a stack of Font objects. The font at the top is the one
     * used for the text that is being collected. */
    private static Stack fonts = new Stack();
    /* The current text: words and spaces collected since the last
     * TextFragment was created. */
    private static String text = "";
    /* The title: the text that was found in front of a <title> tag. */
    private static String title = "";
    /* The current URL: the target of the link we are inside, or null
     * when we are not inside a link. */
    private static URL current_url = null;

    /* Should we skip white space? While this is true, space tokens are
     * not added to the current text. */
    private static boolean ignore_ws = true;
    
    /* The nesting. Used by (Un)ordered lists: it is increased when a
     * list begins and decreased when a list ends. */
    private static int nesting = 0;

    /** Parses the HTML document found at the given address.
     *
     *  <p>If the string is not a valid URL, the problem is reported
     *  through {@link Browser#debug} and the program is terminated
     *  with exit status 1.
     *
     *  @param url_str the string representation of the URL to be parsed.
     *  @return a {@link DocumentModel}. */
    public static DocumentModel parse(String url_str) {
        URL target;
        try {
            target = new URL(url_str);
        } catch (MalformedURLException e) {
            Browser.debug(e.toString());
            System.exit(1);
            /* Never reached, but the compiler requires a return. */
            return null;
        }
        return parse(target);
    }

    /** Parses a HTML document.
     *
     *  <p>The document is first read completely and split into tokens.
     *  The tokens are then turned into a tree of boxes: text becomes
     *  TextFragments, and the known tags open and close the boxes that
     *  are kept on the data stack.
     *
     *  <p>The parser keeps its working state in static fields. The
     *  state is reset at the start of every call, so one call can never
     *  be influenced by what an earlier call left behind, but two calls
     *  must not run at the same time.
     *
     *  @param url the URL to be parsed.
     *  @return a {@link DocumentModel}. If the document could not be
     *  read, or if its boxes were not closed properly, the model holds
     *  an {@link ErrorPage} describing the problem. */
    public static DocumentModel parse(URL url) {

        Browser.debug("\n--- Parsing started ---");
        Browser.debug("Parsing " + url);

        /* Forget everything about the previous document. Without this
         * a document that failed to parse would leave its boxes on the
         * stack, and every later document would fail as well. */
        data.clear();
        fonts.clear();
        text = "";
        title = "";
        current_url = null;
        nesting = 0;
        ignore_ws = true;

        try {
            TokenSequence tokens = new TokenSequence();

            /* The tokenizer reads one byte at a time, so we put a
             * buffer between it and the connection. */
            DataInputStream stream = new DataInputStream(
                new BufferedInputStream(
                    url.openConnection().getInputStream()));
            try {
                tokens.tokenize(stream);
            } finally {
                /* Release the connection even if tokenizing failed. */
                stream.close();
            }

            beginBody();

            while (!tokens.empty()) {
                if (tokens.tag() != null && knownTag(tokens.tag().name)) {
                    /***********************
                     *  The token is a tag *
                     ***********************/
                    String tag_name = tokens.tag().name;

                    Browser.debug("Token: <" + tag_name + ">");

                    flushText(tag_name);

                    if (tokens.tag().start) {
                        startTag(tokens, url);
                    } else {
                        endTag(tag_name);
                    }
                } else if (tokens.word() != null) {
                    /************************
                     *  The token is a word *
                     ************************/
                    text = text + tokens.word();
                    ignore_ws = false;
                } else if (tokens.space() && !ignore_ws) {
                    /*************************
                     *  The token is a space *
                     *************************/

                    /* We should add it at the end of our text if the
                     * text doesn't already end with a space. */
                    if (text.length() == 0 ||
                        text.charAt(text.length()-1) != ' ') {
                        text = text + " ";
                    }
                }

                tokens.cut();
            }

            Browser.debug("--- Parsing finished! ---");

            Browser.debug("Text is '" + text + "'");

            endBody();

            debugStack(data);

            if (data.size() == 1) {
                /* There will be exactly one box on the stack if the
                 * syntax of the page was correct. */
                return new DocumentModel((FlexibleBox)data.pop(), title);

            } else {
                String error = "Parse error. There was a problem " +
                    "with the syntax of the document.";
                return new DocumentModel(new ErrorPage(error, url),
                                         "Parse error.");
            }
        } catch (IOException x) {
            return new DocumentModel(new ErrorPage("IOException: " +
                                                   x.getMessage(), url),
                                     "IOException: " + x.getMessage());
        }
    }

    /* Converts any text found since the previous tag into a
     * TextFragment, and inserts the fragment into the Box at the top
     * of the stack. We do not pop the font, as it could be needed
     * later. Text found in front of a title tag becomes the title of
     * the document instead. */
    private static void flushText(String tag_name) {
        if (text.length() == 0) {
            return;
        }

        if (tag_name.equals("title")) {
            title = text;
            ignore_ws = true;
        } else {
            Browser.debug("Creating TextFragment from '" +
                          text + "'");
            TextFragment tf =
                new TextFragment(text,
                                 (Font)fonts.peek(),
                                 current_url);
            if (!(data.peek() instanceof Paragraph)) {
                Browser.debug("Missing Paragraph:" +
                              (FlexibleBox)data.peek());
                beginParagraph();
            }
            ((FlexibleBox)data.peek()).insert(tf);
        }
        text = "";
    }

    /* Handles the start tag at the front of the token sequence: opens
     * the box that belongs to the tag, and pushes the font that should
     * be used inside it. Relative links and image addresses are
     * resolved against url. */
    private static void startTag(TokenSequence tokens, URL url) {
        String tag_name = tokens.tag().name;
        boolean need_new_font = true;

        if (tag_name.equals("p")  ||
            tag_name.equals("h1") ||
            tag_name.equals("h2") ||
            tag_name.equals("h3")) {
            beginParagraph();
        } else if (tag_name.equals("ul")) {
            beginUnorderedList();
        } else if (tag_name.equals("ol")) {
            beginOrderedList();
        } else if (tag_name.equals("li")) {
            beginListItem();
        } else if (tag_name.equals("blockquote")) {
            beginBlockQuote();
        } else if (tag_name.equals("table")) {
            beginTable();
        } else if (tag_name.equals("tr")) {
            beginTableRow();
        } else if (tag_name.equals("td") ||
                   tag_name.equals("th")) {
            beginTableData();
        } else if (tag_name.equals("a")) {
            ignore_ws = false;
            try {
                current_url = new URL(url,
                    tokens.tag().args.lookup("href"));
            } catch(MalformedURLException e) {
                Browser.debug("Skipping link to " +
                    tokens.tag().args.lookup("href"));
                current_url = null;
            }
        } else if (tag_name.equals("img")) {
            /* White space is significant around an image. */
            ignore_ws = false;
            need_new_font = false;
            try {
                ImageBox img =
                    new ImageBox(new URL(url,
                        tokens.tag().args.lookup("src")),
                        current_url);
                Browser.debug("Creating image from " +
                              tokens.tag().args.lookup("src"));
                ((FlexibleBox)data.peek()).insert(img);

            } catch (MalformedURLException e) {
                Browser.debug("Skipping image " +
                              tokens.tag().args.lookup("src"));
            }
        } else if (tag_name.equals("br")) {
            ignore_ws = true;
            need_new_font = false;
            ((FlexibleBox)data.peek()).insert(new Break());
        } else if (tag_name.equals("hr")) {
            ignore_ws = true;
            need_new_font = false;
            ((FlexibleBox)data.peek()).insert(
                new HorizontalRule());
        }

        if (need_new_font) {
            /* We push a new font, appropriate for this tag and the
             * current context. */
            fonts.push(deriveFont(tag_name,
                                  (Font)fonts.peek()));
        }
    }

    /* Handles an end tag: closes the box that belongs to the tag, and
     * pops the font that the start tag pushed. */
    private static void endTag(String tag_name) {
        if (tag_name.equals("p")  || tag_name.equals("h1") ||
            tag_name.equals("h2") || tag_name.equals("h3")) {
            endParagraph();
        } else if (tag_name.equals("blockquote")) {
            endBlockQuote();
        } else if (tag_name.equals("li")) {
            endListItem();
        } else if (tag_name.equals("ul")) {
            endUnorderedList();
        } else if (tag_name.equals("ol")) {
            endOrderedList();
        } else if (tag_name.equals("table")) {
            endTable();
        } else if (tag_name.equals("tr")) {
            endTableRow();
        } else if (tag_name.equals("td") ||
                   tag_name.equals("th")) {
            /* Both kinds of cell are opened by beginTableData(), so
             * both have to be closed here. */
            endTableData();
        } else if (tag_name.equals("a")) {
            // The link has ended.
            current_url = null;
            /* We must not strip the white space that might come after
             * a <a> tag. */
            ignore_ws = false;
        } else if (tag_name.equals("b") ||
                   tag_name.equals("strong") ||
                   tag_name.equals("i") ||
                   tag_name.equals("em") ||
                   tag_name.equals("code") ||
                   tag_name.equals("tt")) {
            /* Inline tags - we must preserve any white space that
             * follows. */
            ignore_ws = false;
        }

        /* The start tags of <img>, <br> and <hr> never push a font, so
         * there is nothing to pop for them. The font pushed by
         * beginBody() must survive stray end tags, as the rest of the
         * parser expects to find a font on the stack. */
        boolean pushed_font = !(tag_name.equals("img") ||
                                tag_name.equals("br") ||
                                tag_name.equals("hr"));
        if (pushed_font && fonts.size() > 1) {
            fonts.pop();
        }
    }

    /* Closes the box at the top of the stack: the box is popped, and if
     * it is not empty it is inserted into the box below it, which is
     * its container. White space is ignored afterwards.
     *
     * If only the outermost box is left there is no container to
     * insert into. This happens when a document has an end tag without
     * a matching start tag. The request is then ignored, as popping
     * the last box would make the following peek fail with an
     * EmptyStackException. */
    private static void endContainer() {
        if (data.size() < 2) {
            Browser.debug("Ignoring attempt to close the outermost box.");
            ignore_ws = true;
            return;
        }

        /* This is the box that has just ended. */
        FlexibleBox b = (FlexibleBox)data.pop();
        if (!b.isEmpty()) {
            /* We insert the Box into the object now at the top of the
             * stack, which will be its container. */
            ((FlexibleBox)data.peek()).insert(b);
        } else {
            Browser.debug("Skipping empty " + b);
        }
        ignore_ws = true;
    }

    /* Starts the document: pushes the outermost box and the first
     * font, and opens the first, implicit Paragraph. */
    private static void beginBody() {
        Browser.debug("Creating Body.");

        VerticalBoxStack body = new VerticalBoxStack();
        data.push(body);

        Font first_font = deriveFont("body", null);
        fonts.push(first_font);

        beginParagraph();
    }

    /* Ends the document by closing the last Paragraph, if one is still
     * open at the top of the stack. */
    private static void endBody() {
        endParagraph();
        Browser.debug("Ending Body.");
    }


    /* Opens a new Paragraph. There can only be one open Paragraph at
     * the top of the stack, so a Paragraph that is still open is
     * closed first. Leading white space in the new Paragraph is
     * ignored. */
    private static void beginParagraph() {
        endParagraph();

        Paragraph fresh = new Paragraph();
        data.push(fresh);

        ignore_ws = true;
    }

    /* Closes the Paragraph at the top of the stack. Nothing is closed
     * if the box at the top of the stack is not a Paragraph. */
    private static void endParagraph() {
        Browser.debug("Ending Paragraph.");

        if (!(data.peek() instanceof Paragraph)) {
            /* We can only end a Paragraph if it's the box at the top
             * of the stack. */
            return;
        }

        if (text.length() > 0) {
            /* The text that is still pending belongs to this
             * Paragraph, so it becomes its last TextFragment. */
            Browser.debug("Creating last TextFragment.");
            TextFragment last = new TextFragment(text,
                                                 (Font)fonts.peek(),
                                                 current_url);
            FlexibleBox paragraph = (FlexibleBox)data.peek();
            paragraph.insert(last);
            text = "";
        }

        endContainer();
    }

    /* Opens a BlockQuote together with the implicit Paragraph inside
     * it. A BlockQuote has to be closed by its own end tag, so nothing
     * is closed before the new box is pushed. */
    private static void beginBlockQuote() {
        BlockQuote quote = new BlockQuote();
        data.push(quote);

        beginParagraph();
    }

    /* Closes a BlockQuote. The last Paragraph is closed first, if it's
     * still open.
     *
     * The BlockQuote is only closed if it is the box at the top of the
     * stack, just like endListItem() and endTableData() check for
     * their boxes. An end tag without a matching start tag would
     * otherwise close some other box, or the outermost box of the
     * document. */
    private static void endBlockQuote() {
        endParagraph();

        if (data.peek() instanceof BlockQuote) {
            endContainer();
        } else {
            Browser.debug("Ignoring end of BlockQuote, none is open.");
        }
    }

    /* Opens a ListItem together with the implicit Paragraph inside it.
     * There can only be one open ListItem at the top of the stack, so
     * a ListItem that is still open is closed first. */
    private static void beginListItem() {
        endListItem();

        ListItem item = new ListItem();
        data.push(item);

        beginParagraph();
    }

    /* Closes the last Paragraph, if it's still open, and then the
     * ListItem, if there is one at the top of the stack. */
    private static void endListItem() {
        endParagraph();

        Object top = data.peek();
        if (top instanceof ListItem) {
            endContainer();
        }
    }

    /* Opens an OrderedList. The last Paragraph is closed first, if it's
     * still open. The list is created with the current nesting and the
     * current font, and the nesting is then increased by one. An
     * OrderedList has to be closed explicitly. */
    private static void beginOrderedList() {
        endParagraph();

        Font current = (Font)fonts.peek();
        OrderedList list = new OrderedList(nesting, current);
        data.push(list);

        nesting++;
        ignore_ws = true;
    }

    /* Closes an OrderedList. The last ListItem is closed first, if it's
     * still open.
     *
     * The list is only closed, and the nesting only decreased, if an
     * OrderedList really is the box at the top of the stack. An end
     * tag without a matching start tag would otherwise close some
     * other box and make the nesting wrong for the lists that follow. */
    private static void endOrderedList() {
        endListItem();

        if (data.peek() instanceof OrderedList) {
            endContainer();
            nesting -= 1;
        } else {
            Browser.debug("Ignoring end of OrderedList, none is open.");
        }
    }

    /* Opens an UnorderedList. The last Paragraph is closed first, if
     * it's still open. The list is created with the current nesting,
     * and the nesting is then increased by one. An UnorderedList has
     * to be closed explicitly. */
    private static void beginUnorderedList() {
        endParagraph();

        UnorderedList list = new UnorderedList(nesting);
        data.push(list);

        nesting++;
        ignore_ws = true;
    }

    /* Closes an UnorderedList. The last ListItem is closed first, if
     * it's still open.
     *
     * The list is only closed, and the nesting only decreased, if an
     * UnorderedList really is the box at the top of the stack. An end
     * tag without a matching start tag would otherwise close some
     * other box and make the nesting wrong for the lists that follow. */
    private static void endUnorderedList() {
        endListItem();

        if (data.peek() instanceof UnorderedList) {
            endContainer();
            nesting -= 1;
        } else {
            Browser.debug("Ignoring end of UnorderedList, none is open.");
        }
    }

    /* Opens a Table. The last Paragraph is closed first, if it's still
     * open. */
    private static void beginTable() {
        endParagraph();

        Table table = new Table();
        data.push(table);

        ignore_ws = true;
    }

    /* Closes a Table. The last TableRow is closed first, if it's still
     * open.
     *
     * The Table is only closed if it is the box at the top of the
     * stack, just like endListItem() and endTableData() check for
     * their boxes. An end tag without a matching start tag would
     * otherwise close some other box, or the outermost box of the
     * document. */
    private static void endTable() {
        endTableRow();

        if (data.peek() instanceof Table) {
            endContainer();
        } else {
            Browser.debug("Ignoring end of Table, none is open.");
        }
    }

    /* Starts a new row in a table. The previous row is ended first, if
     * it's still open. No box is pushed for a row; endTableRow() tells
     * the Table itself when a row has ended. */
    private static void beginTableRow() {
        endTableRow();
        ignore_ws = true;
    }

    /* Ends a row in a table. The last TableData cell is closed first,
     * if it's still open. If a Table is then at the top of the stack,
     * it is told that the row has ended. */
    private static void endTableRow() {
        endTableData();

        Object top = data.peek();
        if (top instanceof Table) {
            ((Table)top).endTableRow();
        }

        ignore_ws = true;
    }

    /* Opens a TableData cell together with the implicit Paragraph
     * inside it. The previous cell is closed first, if it's still
     * open. */
    private static void beginTableData() {
        endTableData();

        TableData cell = new TableData();
        data.push(cell);

        beginParagraph();
    }

    /* Closes the last Paragraph, if it's still open, and then the
     * TableData cell, if there is one at the top of the stack. */
    private static void endTableData() {
        endParagraph();

        Object top = data.peek();
        if (top instanceof TableData) {
            endContainer();
        }

        ignore_ws = true;
    }

    /* The names of the tags that the parser reacts to. The names are in
     * lower case. */
    private static final String[] KNOWN_TAGS = {
        "body", "title",
        "h1", "h2", "h3", "p",
        "b", "strong", "i", "em",
        "ul", "ol", "li",
        "blockquote",
        "img", "a",
        "table", "tr", "td", "th",
        "br", "hr",
        "code", "tt", "pre"
    };

    /* Tells if the parser knows the tag with the given name. The
     * comparison is case sensitive. */
    private static boolean knownTag(String tag) {
        for (int i = 0; i < KNOWN_TAGS.length; i++) {
            if (tag.equals(KNOWN_TAGS[i])) {
                return true;
            }
        }
        return false;
    }


    /* Prints the elements of the stack as debug output, starting with
     * the element at the top. The stack is the same afterwards. */
    private static void debugStack(Stack s) {
        for (int i = s.size() - 1; i >= 0; i--) {
            Browser.debug(s.elementAt(i).toString());
        }
    }
    

    /* Finds the font to use inside the given tag.
     *
     * tag is the name of the tag, in lower case. current is the font
     * in use outside the tag; it may be null, in which case a plain
     * 14 point default font is used as the starting point. The null
     * check is made before anything else, so that it protects every
     * tag and not only the ones without a style of their own.
     *
     * The result is current itself if the tag doesn't change the font,
     * and otherwise a new font derived from it. */
    private static Font deriveFont(String tag, Font current) {

        Font base = current;
        if (base == null) {
            base = new Font(null, Font.PLAIN, 14);
        }

        if (tag.equals("i") || tag.equals("em")) {
            /* We turn italics on if it was off, and vice versa. */
            return base.deriveFont(base.getStyle() ^ Font.ITALIC);
        } else if (tag.equals("b") || tag.equals("strong") ||
                   tag.equals("th")) {
            /* We turn on bold unconditionally */
            return base.deriveFont(base.getStyle() | Font.BOLD);
        } else if (tag.equals("h1")) {
            return base.deriveFont((float)24.0);
        } else if (tag.equals("h2")) {
            return base.deriveFont((float)18.0);
        } else if (tag.equals("h3")) {
            /* The smallest heading is set apart by toggling italics. */
            return base.deriveFont(base.getStyle() ^ Font.ITALIC,
                                   (float)16.0);
        } else if (tag.equals("code") ||
                   tag.equals("tt") ||
                   tag.equals("pre")) {
            return new Font("Monospaced",
                            base.getStyle(),
                            base.getSize());
        }

        return base;
    }

    /**
     * A sequence of HTML tokens. {@link #tokenize} reads a document and
     * splits it into tokens: words, runs of white space, and tags with
     * their arguments. The tokens are then consumed from the front
     * with {@link #empty}, {@link #word}, {@link #space}, {@link #tag}
     * and {@link #cut}.
     *
     * <p>Each byte of the input is used as the character with the same
     * number; no other character encoding is applied.
     *
     * <p>The end of the input is a normal condition for the tokenizer:
     * it is represented by a special look-ahead character instead of
     * an exception, so the token that is being read when the input
     * ends is kept, and no loop can wait forever for a character that
     * never arrives.
     */
    private static class TokenSequence {

        /* The look-ahead character used when the input has ended. The
         * bytes of the input only give the characters 0 to 255, so
         * this value can never be read from the input. */
        private static final char EOF = Character.MAX_VALUE;

        /* All tokens, in the order they were found. */
        Vector v;
        /* The look-ahead character: read, but not yet consumed. */
        char c;
        /* The token being built. It is null if the input that was
         * just read did not result in a token. */
        Token t;
        /* The index in v of the token at the front of the sequence. */
        private int head;

        public TokenSequence() {
            v = new Vector(0);
            head = 0;
        }

        /** Removes the token at the front, if there is one.
         *  @return this sequence. */
        public TokenSequence cut() {
            if (head < v.size()) {
                /* Moving an index instead of removing the first
                 * element keeps the cost independent of the number of
                 * tokens. The slot is cleared, as the token is not
                 * needed anymore. */
                v.setElementAt(null, head);
                head = head + 1;
            }
            return this;
        }

        /** @return true if there are no more tokens. */
        public boolean empty() {
            return head >= v.size();
        }

        /** @return true if the token at the front is white space. */
        public boolean space() {
            return ((Token)v.elementAt(head)).space;
        }

        /** @return the word at the front, or null if the token at the
         *  front is not a word. */
        public String word() {
            return ((Token)v.elementAt(head)).word;
        }

        /** @return the tag at the front, or null if the token at the
         *  front is not a tag. */
        public Tag tag() {
            return ((Token)v.elementAt(head)).tag;
        }

        /* Reads the next character into c. c becomes EOF when there is
         * nothing more to read. */
        private void get(DataInputStream stream) throws IOException {
            int b = stream.read();
            if (b < 0) {
                c = EOF;
            } else {
                c = (char)b;
            }
        }

        private static boolean isBlank(char ch) {
            return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
        }

        private void skipBlanks(DataInputStream stream) throws IOException {
            while (isBlank(c)) get(stream);
        }

        /* Reads a word: everything up to white space, a tag, a
         * character reference or the end of the input. */
        private void readWord(DataInputStream stream) throws IOException {
            StringBuffer b = new StringBuffer();
            while (c != EOF && !isBlank(c) && c != '<' && c != '&') {
                b.append(c);
                get(stream);
            }
            t.word = b.toString();
        }

        /* Reads a name or a value inside a tag. A value in single or
         * double quotes is read up to the matching quote, and returned
         * without the quotes. Anything else is read up to white space,
         * '=', '>' or the end of the input. */
        private String readName(DataInputStream stream) throws IOException {
            StringBuffer b = new StringBuffer();
            if (c == '"' || c == '\'') {
                char quote = c;
                get(stream);
                while (c != quote && c != EOF) {
                    b.append(c);
                    get(stream);
                }
                if (c == quote) get(stream);
            } else {
                while (c != EOF && !isBlank(c) && c != '>' && c != '=') {
                    b.append(c);
                    get(stream);
                }
            }
            return b.toString();
        }

        /* Reads a tag; the '<' has already been consumed. The name of
         * the tag and the names of its arguments are converted to
         * lower case. If the input ends before the '>' is found, the
         * tag is dropped. */
        private void readTag(DataInputStream stream) throws IOException {
            String left, right;
            t.tag = new Tag();
            skipBlanks(stream);
            if (c == '/') {
                t.tag.start = false;
                get(stream);
            }
            skipBlanks(stream);

            /* The conversion must not depend on the locale of the
             * user: in a Turkish locale "TITLE" would otherwise not
             * become "title". */
            String name = readName(stream).toLowerCase(Locale.ENGLISH);
            if (name.endsWith("/")) {
                /* A tag written like <br/>. */
                name = name.substring(0, name.length() - 1);
            }
            t.tag.name = name;

            skipBlanks(stream);
            while (c != '>' && c != EOF) {
                left = readName(stream).toLowerCase(Locale.ENGLISH);
                skipBlanks(stream);
                if (c == '=') {
                    get(stream);
                    skipBlanks(stream);
                    right = readName(stream);
                } else {
                    right = null;
                }
                t.tag.args.add(left, right);
                skipBlanks(stream);
            }

            if (c == EOF) {
                t = null;
                return;
            }
            get(stream);
            if (t.tag.start) skipBlanks(stream);
        }

        /* Reads a character reference like &amp; or &#229; and stores
         * the character as a word. The look-ahead character is the
         * '&' when this method is called.
         *
         * A reference consists of letters and digits, and ends with a
         * ';'. If the ';' is missing, or if the reference is not
         * known, the characters are stored as they were written. A
         * lonely '&' in the text does therefore not swallow the text
         * up to the next ';'. */
        private void readSpecial(DataInputStream stream) throws IOException {
            StringBuffer b = new StringBuffer();
            get(stream);
            boolean numeric = (c == '#');
            if (numeric) get(stream);

            while (c != EOF && Character.isLetterOrDigit(c)) {
                b.append(c);
                get(stream);
            }
            String s = b.toString();

            boolean closed = (c == ';');
            if (closed) get(stream);

            String decoded = null;
            if (closed) {
                decoded = numeric ? decodeNumeric(s) : decodeNamed(s);
            }

            if (decoded != null) {
                t.word = decoded;
            } else {
                t.word = "&" + (numeric ? "#" : "") + s + (closed ? ";" : "");
            }
        }

        /* Decodes the digits of a numeric character reference. The
         * digits are decimal, or hexadecimal if they start with an
         * 'x'. Returns null if the digits are not a number, or if the
         * number doesn't fit in a char. */
        private static String decodeNumeric(String digits) {
            try {
                int code;
                if (digits.startsWith("x") || digits.startsWith("X")) {
                    code = Integer.parseInt(digits.substring(1), 16);
                } else {
                    code = Integer.parseInt(digits);
                }
                if (code < 0 || code > Character.MAX_VALUE) return null;
                return String.valueOf((char)code);
            } catch (NumberFormatException x) {
                return null;
            }
        }

        /* Finds the character with the given name, or returns null if
         * the name is not known. The characters outside ASCII are
         * written as escapes, so they don't depend on the encoding of
         * this source file. */
        private static String decodeNamed(String s) {
            if (s.equals("amp"))    return "&";
            if (s.equals("gt"))     return ">";
            if (s.equals("lt"))     return "<";
            if (s.equals("quot"))   return "\"";
            if (s.equals("nbsp"))   return " ";
            if (s.equals("aring"))  return "\u00E5";
            if (s.equals("Aring"))  return "\u00C5";
            if (s.equals("oslash")) return "\u00F8";
            if (s.equals("Oslash")) return "\u00D8";
            if (s.equals("aelig"))  return "\u00E6";
            if (s.equals("AElig"))  return "\u00C6";
            if (s.equals("copy"))   return "\u00A9";
            if (s.equals("reg"))    return "\u00AE";
            if (s.equals("aacute")) return "\u00E1";
            if (s.equals("agrave")) return "\u00E0";
            if (s.equals("Aacute")) return "\u00C1";
            if (s.equals("Agrave")) return "\u00C0";
            if (s.equals("eacute")) return "\u00E9";
            if (s.equals("egrave")) return "\u00E8";
            if (s.equals("Eacute")) return "\u00C9";
            if (s.equals("Egrave")) return "\u00C8";
            return null;
        }

        /* Skips a declaration or a comment; the look-ahead character
         * is the '!' that follows the '<' when this method is called.
         * Nothing is stored as a token.
         *
         * Two dashes in a row begin or end a comment. A '>' ends the
         * declaration if it is found outside of a comment. */
        public void skipMeta(DataInputStream stream) throws IOException {
            /* zero is true when the previous character was not the
             * first of two dashes; even is true outside a comment. */
            boolean zero = true;
            boolean even = true;
            t = null;
            while (true) {
                get(stream);
                if (c == EOF) {
                    return;
                } else if (c=='-') {
                    if (zero) zero=false;
                    else {
                        zero = true;
                        even = !even;
                    }
                } else if (c == '>') {
                    if (even) {
                        get(stream);
                        return;
                    } else zero = true;
                } else zero = true;
            }
        }

        /** Reads the stream to its end, and adds the tokens that are
         *  found to this sequence. If reading fails, the tokens found
         *  so far are kept and the rest of the input is given up.
         *  @param stream the document to split into tokens. */
        public void tokenize(DataInputStream stream) {
            try {
                get(stream);
                while (c != EOF) {
                    t = new Token();
                    switch (c) {
                    case '\r':
                    case ' ':
                    case '\t':
                    case '\n':
                        skipBlanks(stream);
                        t.space = true;
                        break;
                    case '<':
                        get(stream);
                        if (c == '!') skipMeta(stream);
                        else readTag(stream);
                        break;
                    case '&':
                        readSpecial(stream);
                        break;
                    default:
                        readWord(stream);
                        break;
                    }
                    if (t!=null) v.addElement(t);
                }
            } catch (IOException ignored) {
                /* We show as much of the document as we managed to
                 * read. */
                return;
            }
        }

        /* A token is a word, a tag or white space. */
        private static class Token {
            public String word;
            public Tag tag;
            public boolean space;

            public Token() {
                word = null;
                tag = null;
                space = false;
            }
        }

        /* A tag: its name, its arguments, and whether it is a start
         * tag or an end tag. */
        private static class Tag {
            public String name;
            public boolean start;
            public Arguments args;

            public Tag() {
                this.name = null;
                this.start = true;
                this.args = new Arguments();
            }

        }

        /* The arguments of a tag, stored as a linked list of pairs.
         * The list without any pairs is a single node marked as
         * empty. */
        private static class Arguments {
            String left;
            String right;
            Arguments next;
            boolean empty;

            public Arguments() {
                empty = true;
                next = null;
            }

            public Arguments(String left, String right, Arguments next) {
                this.left = left;
                this.right = right;
                this.next = next;
                this.empty = false;
            }

            /* Adds a pair at the front of the list. */
            public void add(String left, String right) {
                if (!empty) this.next = new Arguments(this.left,
                                                      this.right,
                                                      this.next);
                this.left = left;
                this.right = right;
                this.empty = false;
            }

            /* Finds the value that was added last for the given name.
             * Returns null if the name is not found, or if it was
             * given without a value. */
            public String lookup(String left) {
                for (Arguments a = this; a != null && !a.empty; a = a.next) {
                    if (left.equals(a.left)) return a.right;
                }
                return null;
            }
        }
    }

}