/* dIntProg Browser. A webbrowser written in Java. * Copyright (C) 2001 Martin Geisler <gimpster@gimpster.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. */ import java.net.*; import java.io.*; import java.util.*; import java.awt.*; /** A parser for HTML. */ public class Parser { /* Data */ private static Stack data = new Stack(); /* Fonts */ private static Stack fonts = new Stack(); /* The current text */ private static String text = ""; /* The title */ private static String title = ""; /* The current URL */ private static URL current_url = null; /* Should we skip white space? */ private static boolean ignore_ws = true; /* The nesting. Used by (Un)ordered lists. */ private static int nesting = 0; /** Parses a HTML document. * @param url_str the string representation of the URL to be parsed. * @return a {@link DocumentModel}. */ public static DocumentModel parse(String url_str) { try { return parse(new URL(url_str)); } catch (MalformedURLException e) { Browser.debug(e.toString()); System.exit(1); return null; } } /** Parses a HTML document. * @param url the URL to be parsed. * @return a {@link DocumentModel}. */ public static DocumentModel parse(URL url) { Browser.debug("\n--- Parsing started ---"); Browser.debug("Parsing " + url); DataInputStream stream; TokenSequence tokens; try { stream = new DataInputStream(url.openConnection().getInputStream()); tokens = new TokenSequence(); tokens.tokenize(stream); stream.close(); int count = 1; ignore_ws = true; boolean found_body = false; beginBody(); while (!tokens.empty()) { if (tokens.tag() != null && knownTag(tokens.tag().name)) { /*********************** * The token is a tag * ***********************/ String tag_name = tokens.tag().name; Browser.debug("Token: <" + tag_name + ">"); /* The first thing to do, is to convert any text * found since the previous tag into a * TextFragment. The fragment is then inserted * into the Box at the top of the stack. We do not * pop the font, as it is could be needed later. * */ if (text.length() > 0) { /* The last tag was <title> */ if (tag_name.equals("title")) { title = text; ignore_ws = true; } else { Browser.debug("Creating TextFragment from '" + text + "'"); TextFragment tf = new TextFragment(text, (Font)fonts.peek(), current_url); if (!(data.peek() instanceof Paragraph)) { Browser.debug("Missing Paragraph:" + (FlexibleBox)data.peek()); beginParagraph(); } ((FlexibleBox)data.peek()).insert(tf); } text = ""; } if (tokens.tag().start) { /* A start tag. */ boolean need_new_font = true; if (tag_name.equals("p") || tag_name.equals("h1") || tag_name.equals("h2") || tag_name.equals("h3")) { beginParagraph(); } else if (tag_name.equals("ul")) { beginUnorderedList(); } else if (tag_name.equals("ol")) { beginOrderedList(); } else if (tag_name.equals("li")) { beginListItem(); } else if (tag_name.equals("blockquote")) { beginBlockQuote(); } else if (tag_name.equals("table")) { beginTable(); } else if (tag_name.equals("tr")) { beginTableRow(); } else if (tag_name.equals("td") || tag_name.equals("th")) { beginTableData(); } else if (tag_name.equals("a")) { ignore_ws = false; try { current_url = new URL(url, tokens.tag().args.lookup("href")); } catch(MalformedURLException e) { Browser.debug("Skipping link to " + tokens.tag().args.lookup("href")); current_url = null; } } else if (tag_name.equals("img")) { /* White space is significant around an * image. */ ignore_ws = false; need_new_font = false; try { ImageBox img = new ImageBox(new URL(url, tokens.tag().args.lookup("src")), current_url); Browser.debug("Creating image from " + tokens.tag().args.lookup("src")); ((FlexibleBox)data.peek()).insert(img); } catch (MalformedURLException e) { Browser.debug("Skipping image " + tokens.tag().args.lookup("src")); } } else if (tag_name.equals("br")) { ignore_ws = true; need_new_font = false; ((FlexibleBox)data.peek()).insert(new Break()); } else if (tag_name.equals("hr")) { ignore_ws = true; need_new_font = false; ((FlexibleBox)data.peek()).insert( new HorizontalRule()); } if (need_new_font) { /* We push a new font, appropriate for * this tag and the current context. */ fonts.push(deriveFont(tag_name, (Font)fonts.peek())); } } else { /* Finishing a tag. */ if (tag_name.equals("p") || tag_name.equals("h1") || tag_name.equals("h2") || tag_name.equals("h3")) { endParagraph(); } else if (tag_name.equals("blockquote")) { endBlockQuote(); } else if (tag_name.equals("li")) { endListItem(); } else if (tag_name.equals("ul")) { endUnorderedList(); } else if (tag_name.equals("ol")) { endOrderedList(); } else if (tag_name.equals("table")) { endTable(); } else if (tag_name.equals("tr")) { endTableRow(); } else if (tag_name.equals("td")) { endTableData(); } else if (tag_name.equals("a")) { // The link has ended. current_url = null; /* We must not strip the white space that * might come after a <a> tag. */ ignore_ws = false; } else if (tag_name.equals("b") || tag_name.equals("strong") || tag_name.equals("i") || tag_name.equals("em") || tag_name.equals("code") || tag_name.equals("tt") ) { /* Inline tags - we must preserve any * white space that follows. */ ignore_ws = false; } fonts.pop(); } } else if (tokens.word() != null) { /************************ * The token is a word * ************************/ text = text + tokens.word(); ignore_ws = false; } else if (tokens.space() && !ignore_ws) { /************************* * The token is a space * *************************/ /* We should add it at the end of our text if the * text doesn't already end with a space. */ if (text.length() == 0 || text.charAt(text.length()-1) != ' ') { text = text + " "; } } count = count + 1; tokens.cut(); } Browser.debug("--- Parsing finished! ---"); Browser.debug("Text is '" + text + "'"); endBody(); debugStack(data); if (data.size() == 1) { /* There will be exactly one box on the stack if the * syntax of the page was correct. */ return new DocumentModel((FlexibleBox)data.pop(), title); } else { String error = "Parse error. There was a problem " + "with the syntax of the document."; return new DocumentModel(new ErrorPage(error, url), "Parse error."); } } catch (IOException x) { return new DocumentModel(new ErrorPage("IOException: " + x.getMessage(), url), "IOException: " + x.getMessage()); } } private static void endContainer() { /* This is the box that has just ended. */ FlexibleBox b = (FlexibleBox)data.pop(); if (!b.isEmpty()) { /* We insert the Box into the object now * at the top of the stack, which will be * container. */ ((FlexibleBox)data.peek()).insert(b); } else { Browser.debug("Skipping empty " + b); } ignore_ws = true; } private static void beginBody() { Browser.debug("Creating Body."); data.push(new VerticalBoxStack()); fonts.push(deriveFont("body", null)); beginParagraph(); } private static void endBody() { endParagraph(); Browser.debug("Ending Body."); } private static void beginParagraph() { // The can only be one open Paragraph at the top of the stack. endParagraph(); data.push(new Paragraph()); ignore_ws = true; } private static void endParagraph() { Browser.debug("Ending Paragraph."); if (data.peek() instanceof Paragraph) { /* We have to create a the last TextFragment. */ if (text.length() > 0) { Browser.debug("Creating last TextFragment."); TextFragment tf = new TextFragment(text, (Font)fonts.peek(), current_url); ((FlexibleBox)data.peek()).insert(tf); text = ""; } /* We can only end a Paragraph if it's the box at the top of * the stack. */ endContainer(); } } private static void beginBlockQuote() { /* It is required that a BlockQuote is closed again, so we * shouldn't do that without being told. */ data.push(new BlockQuote()); /* A BlockQuote starts an implicit Paragraph. */ beginParagraph(); } private static void endBlockQuote() { /* When we close a BlockQuote, we should also close the last * Paragraph, if it's still open. */ endParagraph(); endContainer(); } private static void beginListItem() { // The can only be one open ListItem at the top of the stack. endListItem(); data.push(new ListItem()); /* A ListItem also starts an implicit Paragraph */ beginParagraph(); } private static void endListItem() { /* When we close a ListItem, we should also close the last * Paragraph, if it's still open. */ endParagraph(); /* We can now try to close the ListItem, if there is one at * the top of the stack. */ if (data.peek() instanceof ListItem) { endContainer(); } } private static void beginOrderedList() { /* When we begin a list, we have to close the last Paragraph, * if it s still open. */ endParagraph(); /* It is required that an OrderedList is closed explicit. */ data.push(new OrderedList(nesting, (Font)fonts.peek())); nesting += 1; ignore_ws = true; } private static void endOrderedList() { /* When we close an OrderedList, we have to end the last * ListItem, if it's still open. */ endListItem(); endContainer(); nesting -= 1; } private static void beginUnorderedList() { /* When we begin a list, we have to close the last Paragraph, * if it s still open. */ endParagraph(); /* It is required that an UnorderedList is closed explicit. */ data.push(new UnorderedList(nesting)); nesting += 1; ignore_ws = true; } private static void endUnorderedList() { /* When we close an UnorderedList, we have to end the last * ListItem, if it's still open. */ endListItem(); endContainer(); nesting -= 1; } private static void beginTable() { /* When we start a table, we should close the last Paragraph, * if it's still open. */ endParagraph(); data.push(new Table()); ignore_ws = true; } private static void endTable() { /* We have to close the last TableRow, if it's still open. */ endTableRow(); /* We can now close the Table. */ endContainer(); } private static void beginTableRow() { // We have to close the previous TableRow, if it's still open. endTableRow(); //data.push(new TableRow()); ignore_ws = true; } private static void endTableRow() { /* We have to close the last TableData cell, if it's still * open. */ endTableData(); /* We can now try to tell the table that a TableRow has ended, * if there is a Table at the top of the stack. */ if (data.peek() instanceof Table) { /* We can now close the TableRow. */ //endContainer(); ((Table)data.peek()).endTableRow(); } ignore_ws = true; } private static void beginTableData() { /* We have to close the previous TableData cell, if it's still * open. */ endTableData(); data.push(new TableData()); /* A TableData cell also starts an implicit Paragraph */ beginParagraph(); } private static void endTableData() { /* We have to close the last Paragraph, if it's still open. */ endParagraph(); /* We can now try to close the TableRow, if there is one at * the top of the stack. */ if (data.peek() instanceof TableData) { /* We can now close the TableData. */ endContainer(); } ignore_ws = true; } private static boolean knownTag(String tag) { return (tag.equals("body") || tag.equals("title") || tag.equals("h1") || tag.equals("h2") || tag.equals("h3") || tag.equals("p") || tag.equals("b") || tag.equals("strong") || tag.equals("i") || tag.equals("em") || tag.equals("ul") || tag.equals("ol") || tag.equals("li") || tag.equals("blockquote") || tag.equals("img") || tag.equals("a") || tag.equals("table") || tag.equals("tr") || tag.equals("td") || tag.equals("th") || tag.equals("br") || tag.equals("hr") || tag.equals("code") || tag.equals("tt") || tag.equals("pre")); } private static void debugStack(Stack s) { Stack tmp = new Stack(); while (s.size() > 0) { Browser.debug(s.peek().toString()); tmp.push(s.pop()); } while (tmp.size() > 0) { s.push(tmp.pop()); } } private static Font deriveFont(String tag, Font current) { Font derived = null; if (tag.equals("i") || tag.equals("em")) { /* We turn italics on if it was off, and vice versa. */ if (current.isItalic()) { derived = current.deriveFont(current.getStyle() - Font.ITALIC); } else { derived = current.deriveFont(current.getStyle() + Font.ITALIC); } } else if (tag.equals("b") || tag.equals("strong") || tag.equals("th")) { /* We turn on bold unconditionally */ if (!current.isBold()) { derived = current.deriveFont(current.getStyle() + Font.BOLD); } else { derived = current.deriveFont(current.getStyle()); } } else if (tag.equals("h1")) { derived = current.deriveFont((float)24.0); } else if (tag.equals("h2")) { derived = current.deriveFont((float)18.0); } else if (tag.equals("h3")) { if (current.isItalic()) { derived = current.deriveFont(current.getStyle() - Font.ITALIC, (float)16.0); } else { derived = current.deriveFont(current.getStyle() + Font.ITALIC, (float)16.0); } } else if (tag.equals("code") || tag.equals("tt") || tag.equals("pre")) { derived = new Font("Monospaced", current.getStyle(), current.getSize()); } else if (current == null) { derived = new Font(null, Font.PLAIN, 14); } else { derived = current; } return derived; } private static class TokenSequence { Vector v; char c; Token t; public TokenSequence cut() { if (!v.isEmpty()) v.removeElementAt(0); // v.removeElementAt(0); return this; } public boolean empty() { return v.isEmpty(); } public boolean space() { return ((Token)v.elementAt(0)).space; } public String word() { return ((Token)v.elementAt(0)).word; } public Tag tag() { return ((Token)v.elementAt(0)).tag; } private void get(DataInputStream stream) throws IOException { int b; b = stream.readByte(); if (b<0) b+=256; c = (char)b; } private void skipBlanks(DataInputStream stream) throws IOException { while (c==' ' || c == '\t' || c == '\n' || c == '\r') get(stream); } private void readWord(DataInputStream stream) throws IOException { StringBuffer b = new StringBuffer(); while (c!=' ' && c!= '\t' && c!='\n' && c!='<' && c!='&' && c!='\r') { b.append(c); get(stream); } t.word = b.toString(); } private String readName(DataInputStream stream) { StringBuffer b = new StringBuffer(); try { if (c=='"') { get(stream); while (c!='"') { b.append(c); get(stream); } get(stream); } else { while (c != ' ' && c != '\t' && c != '\n' && c != '>' && c != '=' && c != '\r') { b.append(c); get(stream); } } } catch (IOException x) { return b.toString(); } return b.toString(); } private void readTag(DataInputStream stream) throws IOException { StringBuffer b = new StringBuffer(); String left,right; t.tag = new Tag(); skipBlanks(stream); if (c == '/') { t.tag.start = false; get(stream); } skipBlanks(stream); t.tag.name = readName(stream).toLowerCase(); skipBlanks(stream); while (c != '>') { left = readName(stream).toLowerCase(); skipBlanks(stream); if (c == '=') { get(stream); skipBlanks(stream); right = readName(stream); } else { right = null; } t.tag.args.add(left, right); skipBlanks(stream); } get(stream); if (t.tag.start) skipBlanks(stream); } private void readSpecial(DataInputStream stream) { StringBuffer b = new StringBuffer(); String s; try { get(stream); if (c == ' ') { t.word = "&"; return; } if (c == '#') { get(stream); while (c != ';') { b.append(c); get(stream); } get(stream); s = b.toString(); try { int i; i = Integer.parseInt(s); char a[] = { (char)i }; t.word = new String(a); } catch (NumberFormatException x) { t.word = "&#" + s +";"; } return; } while (c != ';') { b.append(c); get(stream); } get(stream); s = b.toString(); if (s.equals("amp")) { t.word = "&"; return; } if (s.equals("gt")) { t.word = ">"; return; } if (s.equals("lt")) { t.word = "<"; return; } if (s.equals("quot")) { t.word = "\""; return; } if (s.equals("nbsp")) { t.word = " "; return; } if (s.equals("aring")) { t.word = "å"; return; } if (s.equals("Aring")) { t.word = "Å"; return; } if (s.equals("oslash")) { t.word = "ø"; return; } if (s.equals("Oslash")) { t.word = "Ø"; return; } if (s.equals("aelig")) { t.word = "æ"; return; } if (s.equals("AElig")) { t.word = "Æ"; return; } if (s.equals("copy")) { t.word = "©"; return; } if (s.equals("reg")) { t.word = "®"; return; } if (s.equals("aacute")) { t.word = "á"; return; } if (s.equals("agrave")) { t.word = "à"; return; } if (s.equals("Aacute")) { t.word = "Á"; return; } if (s.equals("Agrave")) { t.word = "À"; return; } if (s.equals("eacute")) { t.word = "é"; return; } if (s.equals("egrave")) { t.word = "è"; return; } if (s.equals("Eacute")) { t.word = "É"; return; } if (s.equals("Egrave")) { t.word = "È"; return; } t.word = "&" + s +";"; } catch (IOException x) {} } public void skipMeta(DataInputStream stream) throws IOException { boolean zero = true; boolean even = true; t = null; while (true) { get(stream); if (c=='-') { if (zero) zero=false; else { zero = true; even = !even; } } else if (c == '>') { if (even) { get(stream); return; } else zero = true; } else zero = true; } } public TokenSequence() { v = new Vector(0); } public void tokenize(DataInputStream stream) { try { get(stream); while (true) { t = new Token(); switch (c) { case '\r': case ' ': case '\t': case '\n': skipBlanks(stream); t.space = true; break; case '<': get(stream); if (c == '!') skipMeta(stream); else readTag(stream); break; case '&': readSpecial(stream); break; default: readWord(stream); break; } if (t!=null) v.addElement(t); } } catch (IOException x) { return; } } private class Token { public String word; public Tag tag; public boolean space; public Token() { word = null; tag = null; space = false; } } private class Tag{ public String name; public boolean start; public Arguments args; public Tag() { this.name = null; this.start = true; this.args = new Arguments(); } } private class Arguments { String left; String right; Arguments next; boolean empty; public Arguments() { empty = true; next = null; } public Arguments(String left, String right, Arguments next) { this.left = left; this.right = right; this.next = next; this.empty = false; } public void add(String left, String right) { if (!empty) this.next = new Arguments(this.left, this.right, this.next); this.left = left; this.right = right; this.empty = false; } public String lookup(String left) { if (empty) return null; if (left.equals(this.left)) return this.right; if (next == null) return null; return next.lookup(left); } } } }