001 package com.mockrunner.util.web;
002
003 import java.io.StringReader;
004 import java.io.StringWriter;
005 import java.util.List;
006
007 import org.apache.commons.logging.Log;
008 import org.apache.commons.logging.LogFactory;
009 import org.apache.xerces.parsers.DOMParser;
010 import org.cyberneko.html.HTMLConfiguration;
011 import org.jdom.Element;
012 import org.jdom.input.DOMBuilder;
013 import org.jdom.output.XMLOutputter;
014 import org.xml.sax.InputSource;
015
016 import com.mockrunner.base.NestedApplicationException;
017
018 /**
019 * Util class for HTML and XML parsing.
020 */
021 public class XmlUtil
022 {
023 private final static Log log = LogFactory.getLog(XmlUtil.class);
024
025 /**
026 * Convinience method for HTML fragments. Returns the body
027 * as JDOM <code>Element</code>.
028 *
029 * If an HTML documents looks like this:
030 * <pre>
031 * <html>
032 * <head>
033 * </head>
034 * <body>
035 * <h1>
036 * </h1>
037 * </body>
038 * </html>
039 * </pre>
040 *
041 * the method returns the h1 tag as <code>Element</code>.
042 * @param document the <code>org.jdom.Document</code>
043 * @return the body <code>Element</code>
044 */
045 public static Element getBodyFragmentFromJDOMDocument(org.jdom.Document document)
046 {
047 Element element = document.getRootElement().getChild("BODY");
048 if(null == element)
049 {
050 element = document.getRootElement().getChild("body");
051 }
052 if(null != element)
053 {
054 List childs = element.getChildren();
055 if(null != childs && childs.size() > 0) return (Element)childs.get(0);
056 }
057 return null;
058 }
059
060 /**
061 * @deprecated use {@link #getBodyFragmentFromJDOMDocument}
062 */
063 public static Element getBodyFragmentJDOMDocument(org.jdom.Document document)
064 {
065 return getBodyFragmentFromJDOMDocument(document);
066 }
067
068 /**
069 * Returns the documents XML content as a string.
070 * @param document the <code>org.jdom.Document</code>
071 * @return the output as string
072 */
073 public static String createStringFromJDOMDocument(org.jdom.Document document)
074 {
075 try
076 {
077 XMLOutputter outputter = new XMLOutputter();
078 StringWriter writer = new StringWriter();
079 outputter.output(document, writer);
080 writer.flush();
081 return writer.toString();
082 }
083 catch(Exception exc)
084 {
085 log.error(exc.getMessage(), exc);
086 throw new NestedApplicationException(exc);
087 }
088 }
089
090 /**
091 * Creates a JDOM <code>Document</code> from a specified
092 * W3C <code>Document</code>.
093 * @param document the <code>org.w3c.dom.Document</code>
094 * @return the <code>org.jdom.Document</code>
095 */
096 public static org.jdom.Document createJDOMDocument(org.w3c.dom.Document document)
097 {
098 return new DOMBuilder().build(document);
099 }
100
101 /**
102 * Returns a parser suitable for parsing HTML documents.
103 * The NekoHTML parser is used with some settings to
104 * preserve case of tag names and disable namespace processing.
105 * This method is used by {@link #parseHTML}.
106 * @return instance of <code>org.apache.xerces.parsers.DOMParser</code>
107 * with Neko configuration
108 */
109 public static DOMParser getHTMLParser()
110 {
111 try
112 {
113 HTMLConfiguration config = new HTMLConfiguration();
114 config.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
115 config.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
116 DOMParser parser = new DOMParser(config);
117 return parser;
118 }
119 catch(Exception exc)
120 {
121 log.error(exc.getMessage(), exc);
122 throw new NestedApplicationException(exc);
123 }
124 }
125
126 /**
127 * Parses the specified HTML with the NekoHTML parser.
128 * If you want to use another HTML parser or configure
129 * the NekoHTML parser with special features, you can use
130 * the <code>parse</code> method.
131 * @param source the HTML as String
132 * @return the parsed document as org.w3c.dom.Document
133 */
134 public static org.w3c.dom.Document parseHTML(String source)
135 {
136 try
137 {
138 return parse(getHTMLParser(), source);
139 }
140 catch(Exception exc)
141 {
142 log.error(exc.getMessage(), exc);
143 throw new NestedApplicationException(exc);
144 }
145 }
146
147 /**
148 * Parses the specified XML with the specified parser.
149 * The main purpose of this method is to use the NekoHTML
150 * parser with custom features and properties. If you can live
151 * with the settings provided by Mockrunner, you can use
152 * {@link #parseHTML}.
153 * @param parser the parser (must extend
154 * <code>org.apache.xerces.parsers.DOMParser</code>),
155 * e.g. the one returned by {@link #getHTMLParser}
156 * @param source the XML as String
157 * @return the parsed document as org.w3c.dom.Document
158 */
159 public static org.w3c.dom.Document parse(DOMParser parser, String source)
160 {
161 try
162 {
163 parser.parse(new InputSource(new StringReader(source)));
164 return parser.getDocument();
165 }
166 catch(Exception exc)
167 {
168 log.error(exc.getMessage(), exc);
169 throw new NestedApplicationException(exc);
170 }
171 }
172 }