#region Copyright 2010-2014 by Roger Knapp, Licensed under the Apache License, Version 2.0
/* Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#endregion
using System;
using System.Collections.Generic;
using NUnit.Framework;
using CSharpTest.Net.Html;
using System.IO;
using System.Text.RegularExpressions;
using CSharpTest.Net.Utils;
using System.Net;
using System.Xml.XPath;
using CSharpTest.Net.IO;
using XmlDocument = System.Xml.XmlDocument;
using System.Xml;

namespace CSharpTest.Net.Library.Test
{
    /// <summary>
    /// NUnit fixture exercising <c>HtmlLightDocument</c>, <c>XmlLightDocument</c>,
    /// <c>XmlLightElement</c> and <c>XmlLightParser</c>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): several markup literals in this fixture (for example the shared
    /// <c>document</c> constant and the empty inputs handed to the parsers) look as if
    /// their tags were stripped by whatever tool produced this copy of the file. They are
    /// reproduced here exactly as found; restore them from source control before relying
    /// on the assertions that depend on them.
    /// </remarks>
    [TestFixture]
    public partial class TestHtmlParser
    {
        // Shared input for most tests below. NOTE(review): appears to have lost its markup - confirm.
        const string document = @" Document Title cdata! ]]>
Hi
,

this is content. ";

        /// <summary>
        /// Flattens whitespace for comparisons: CR, LF and TAB become spaces, the result is
        /// trimmed, and runs of spaces are collapsed to a single space.
        /// </summary>
        /// <param name="text">The text to normalize.</param>
        /// <returns>The trimmed, single-spaced text.</returns>
        private string Normalize(string text)
        {
            text = text.Replace('\r', ' ').Replace('\n', ' ').Replace('\t', ' ').Trim();

            // The pattern must be TWO spaces: replacing one space with one space is a no-op
            // and would keep this loop spinning forever on any input containing a space.
            while (text.Contains("  "))
            {
                text = text.Replace("  ", " ");
            }

            return text;
        }

        /// <summary>
        /// Checks that WriteUnformatted reproduces the source text, both through a
        /// string-backed writer and through a stream-backed writer.
        /// </summary>
        [Test]
        public void TestDocUnformatted()
        {
            string docText = @"";
            XmlLightDocument doc = new XmlLightDocument(docText);

            using (TextWriter text = new StringWriter())
            {
                doc.WriteUnformatted(text);
                Assert.AreEqual(docText, text.ToString());
            }

            using (MemoryStream ms = new MemoryStream())
            using (TextWriter writer = new StreamWriter(ms))
            {
                doc.WriteUnformatted(writer);
                // Push buffered characters into the stream before rewinding it for reading.
                writer.Flush();
                ms.Position = 0;

                using (StreamReader reader = new StreamReader(ms))
                {
                    Assert.AreEqual(docText, reader.ReadToEnd());
                }
            }
        }

        /// <summary>
        /// Builds html/body elements by hand and compares the document's normalized InnerXml.
        /// </summary>
        [Test]
        public void TestDocToXml()
        {
            HtmlLightDocument doc = new HtmlLightDocument();
            XmlLightElement body = new XmlLightElement(new XmlLightElement(doc, "html"), "body");
            body.IsEmpty = false;
            body.Attributes.Add("id", "bdy");

            // NOTE(review): expected literal looks stripped of markup - confirm against history.
            Assert.AreEqual(" ", Normalize(doc.InnerXml));
        }

        /// <summary>
        /// Parses an XML literal and expects the normalized InnerXml to equal the input.
        /// </summary>
        [Test]
        public void TestXmlNamespace()
        {
            string xml = @"";
            XmlLightDocument doc = new XmlLightDocument(xml);

            Assert.AreEqual(xml, Normalize(doc.InnerXml));
        }

        /// <summary>
        /// Parses a multi-line XML literal (single quotes swapped for double quotes) and
        /// compares normalized input against normalized InnerXml.
        /// </summary>
        [Test]
        public void TestXmlNamespacePrefix()
        {
            string xml = @" hello

world

6ft 155 lbs ".Replace('\'', '"');
            XmlLightDocument doc = new XmlLightDocument(xml);

            Assert.AreEqual(Normalize(xml), Normalize(doc.InnerXml));
        }

        /// <summary>
        /// Writes an HTML document through an XmlWriter and compares the output with the
        /// expected entity-encoded text.
        /// </summary>
        [Test]
        public void TestHtmlEntityRef()
        {
            // NOTE(review): both literals below look partially decoded/stripped - confirm.
            string html = @" <  !""> this char '<' and this one '>' and this one '&' should be encoded. We encoded '   ' and à and ' ' and ' ' all by ourselves. This in not valid xml �, nor is �, but we still allow it. This entity name will pass-through &unknown; this will not &whateverthatmeans; and nor will these &; &#; &h; &l t; &1two; &234; g; &#-123;. ";
            string expect = @" this char '<' and this one '>' and this one '&' should be encoded. We encoded '   ' and à and ' ' and ' ' all by ourselves. This in not valid xml �, nor is �, but we still allow it. This entity name will pass-through &unknown; this will not &whateverthatmeans; and nor will these &; &#; &h; &l t; &1two; &234; &#x00fg; &#-123;. ";

            XmlLightDocument doc = new HtmlLightDocument(html);
            XmlWriterSettings settings = new XmlWriterSettings()
            {
                CheckCharacters = true,
                Indent = false,
                IndentChars = "",
                NewLineChars = "",
                NewLineHandling = NewLineHandling.None,
                OmitXmlDeclaration = true,
                CloseOutput = false
            };

            string xml;
            using (StringWriter sw = new StringWriter())
            using (XmlWriter wtr = XmlWriter.Create(sw, settings))
            {
                doc.WriteXml(wtr);
                // Flush so the StringWriter holds everything before it is read back.
                wtr.Flush();
                xml = sw.ToString();
            }

            Assert.AreEqual(expect, xml);
        }

        /// <summary>
        /// Writes the parsed HTML document to a temp file, validates it as XHTML 1.0
        /// Transitional, re-parses it as XML and compares InnerXml of both documents.
        /// </summary>
        [Test]
        public void TestParseDocument()
        {
            XmlLightDocument doc = new HtmlLightDocument(document);
            XmlLightDocument doc2;

            using (TempFile t = new TempFile())
            {
                using (TextWriter tw = new StreamWriter(t.Open()))
                {
                    doc.WriteXml(tw);
                }

                new XhtmlValidation(XhtmlDTDSpecification.XhtmlTransitional_10).Validate(t.TempPath);
                doc2 = new XmlLightDocument(t.ReadAllText());

                Assert.AreEqual(doc.InnerXml, doc2.InnerXml);
            }
        }

        /// <summary>
        /// Walks the attributes yielded by XmlLightParser.ParseAttributes and checks each
        /// name/value pair in order.
        /// </summary>
        [Test]
        public void TestParseAttributes()
        {
            // NOTE(review): both input literals look stripped of their tag text - confirm.
            // The enumerator type is inferred so that Current exposes Name/Value, and it is
            // disposed once the walk is complete.
            using (var en = XmlLightParser.ParseAttributes("").GetEnumerator())
            {
                Assert.IsTrue(en.MoveNext());
                Assert.AreEqual("a", en.Current.Name);
                Assert.AreEqual("1", en.Current.Value);

                Assert.IsTrue(en.MoveNext());
                Assert.AreEqual("b", en.Current.Name);
                Assert.AreEqual("2", en.Current.Value);

                Assert.IsTrue(en.MoveNext());
                Assert.AreEqual("c", en.Current.Name);
                Assert.AreEqual("3", en.Current.Value);

                Assert.IsTrue(en.MoveNext());
                Assert.AreEqual("d", en.Current.Name);
                Assert.AreEqual(null, en.Current.Value);

                Assert.IsTrue(en.MoveNext());
                Assert.AreEqual("e", en.Current.Name);
                Assert.AreEqual("", en.Current.Value);

                Assert.IsFalse(en.MoveNext());
            }

            using (var en = XmlLightParser.ParseAttributes("").GetEnumerator())
            {
                Assert.IsTrue(en.MoveNext());
                Assert.AreEqual("version", en.Current.Name);
                Assert.AreEqual("1.0", en.Current.Value);

                Assert.IsFalse(en.MoveNext());
            }
        }

        /// <summary>
        /// Checks the normalized text extracted by XmlLightParser.ParseText from the shared document.
        /// </summary>
        [Test]
        public void TestParseText()
        {
            string text = Normalize(XmlLightParser.ParseText(document));

            Assert.AreEqual("Document Title this is > cdata! Hi, this is content.", text);
        }

        /// <summary>
        /// Runs the same XPath against System.Xml.XmlDocument and the light document's
        /// navigator, then checks that element lookup by name is case-sensitive.
        /// </summary>
        [Test]
        public void TestXPath()
        {
            XmlDocument xdoc = new XmlDocument();
            XmlLightDocument doc = new HtmlLightDocument(document);
            string testpath = "/html/body[@id='one' and @class='cls']/../body/div[@id='two' and text() = 'Hi']/@id";

            xdoc.LoadXml(doc.CreateNavigator().InnerXml);
            Assert.IsNotNull(xdoc.SelectSingleNode(testpath));

            XPathNavigator nav = doc.CreateNavigator().SelectSingleNode(testpath);
            Assert.IsNotNull(nav);
            Assert.AreEqual(XPathNodeType.Attribute, nav.NodeType);
            Assert.AreEqual("id", nav.Name);
            Assert.AreEqual("two", nav.Value);

            XmlLightElement e = doc.SelectSingleNode("/html/Head");
            Assert.IsNull(e);
            e = doc.SelectSingleNode("/html/head");
            Assert.IsNotNull(e);
        }

        /// <summary>
        /// Checks sibling links, tag names and the Namespace/LocalName split on XmlLightElement.
        /// </summary>
        [Test]
        public void TestXmlElement()
        {
            XmlLightDocument doc = new HtmlLightDocument(document);

            Assert.IsNull(doc.PrevSibling);
            Assert.IsNull(doc.Children[0].PrevSibling);
            Assert.IsNull(doc.NextSibling);
            Assert.IsNull(doc.Children[doc.Children.Count - 1].NextSibling);

            XmlLightElement e = doc.SelectSingleNode("/html/body//*[@class='2']");
            Assert.IsNotNull(e);
            Assert.AreEqual("p", e.TagName);
            Assert.IsNotNull(e.PrevSibling);
            Assert.AreEqual("p", e.PrevSibling.TagName);
            Assert.AreEqual("", e.Namespace);
            Assert.AreEqual("p", e.LocalName);

            e = new XmlLightElement(null, "a:b");
            Assert.AreEqual("a", e.Namespace);
            Assert.AreEqual("b", e.LocalName);
        }

        /// <summary>
        /// Exercises sibling, namespace and id navigation on the document's XPathNavigator.
        /// </summary>
        [Test]
        public void TestXmlNavigator()
        {
            XPathNavigator nav = new HtmlLightDocument(document).CreateNavigator().SelectSingleNode("/html/body//p[@class='1']");
            XPathNavigator pos = nav.Clone();

            Assert.IsFalse(nav.MoveToPrevious());
            Assert.IsTrue(nav.MoveToNext());
            Assert.IsTrue(nav.MoveToPrevious());
            Assert.IsTrue(nav.IsSamePosition(pos));

            Assert.IsFalse(nav.MoveToFirstNamespace());
            Assert.IsFalse(nav.MoveToNextNamespace());

            Assert.AreSame(nav.NameTable, pos.NameTable);
            Assert.IsNotNull(nav.BaseURI);
            Assert.AreEqual(nav.BaseURI, pos.BaseURI);

            Assert.IsTrue(nav.MoveToId("one"));
            Assert.AreEqual("body", nav.Name);
            // A failed MoveToId is expected to leave the navigator where it was.
            Assert.IsFalse(nav.MoveToId("none-exists"));
            Assert.AreEqual("body", nav.Name);
        }

        /// <summary>
        /// Checks InnerText of the body element, of a nested div, and of the body's text() node.
        /// </summary>
        [Test]
        public void TestInnerText()
        {
            XmlLightDocument doc = new HtmlLightDocument(document);
            XmlLightElement e = doc.SelectSingleNode("/html/body");

            Assert.AreEqual("this is > cdata! Hi, this is content.", Normalize(e.InnerText));
            Assert.AreEqual("Hi", e.SelectSingleNode(".//div[@id='two']").InnerText);
            Assert.AreEqual("this is > cdata!", Normalize(e.SelectSingleNode("text()").InnerText));
        }

        /// <summary>
        /// Expects the sibling following /html/head to be a comment node.
        /// </summary>
        [Test]
        public void TestComments()
        {
            XmlLightDocument doc = new HtmlLightDocument(document);
            XmlLightElement e = doc.SelectSingleNode("/html/head");

            e = e.NextSibling;
            Assert.IsTrue(e.IsComment);
            // NOTE(review): expected literal looks stripped of the comment markup - confirm.
            Assert.AreEqual("", e.InnerXml);
        }

        /// <summary>
        /// Feeds the same input to the HTML document reader and to the XML document reader
        /// (with AttributeFormat.Xml) and checks the resulting roots and attributes.
        /// </summary>
        [Test]
        public void TestParsers()
        {
            // NOTE(review): input literal looks stripped of its markup - confirm.
            string notxml = "";

            HtmlLightDocument html = new HtmlLightDocument();
            XmlLightParser.Parse(notxml, html);
            Assert.AreEqual("html", html.Root.TagName);
            Assert.AreEqual(1, html.Root.Attributes.Count);
            Assert.AreEqual("a", html.Root.Attributes["id"]);
            Assert.AreEqual(1, html.Root.Children.Count);
            Assert.AreEqual("body", html.Root.Children[0].TagName);
            Assert.AreEqual("foo", html.Root.Children[0].Attributes["bar"]);
            Assert.AreEqual("bar", html.Root.Children[0].Attributes["foo"]);

            XmlLightDocument xml = new XmlLightDocument();
            XmlLightParser.Parse(notxml, XmlLightParser.AttributeFormat.Xml, xml);
            Assert.AreEqual(2, xml.Root.Attributes.Count);
            //Not recognized: xml.Root.Attributes["id"]
            Assert.AreEqual("body", xml.Root.TagName);
            Assert.AreEqual("foo", xml.Root.Attributes["bar"]);
            Assert.AreEqual("bar", xml.Root.Attributes["foo"]);
        }

        /// <summary>
        /// Checks counting, generic and non-generic enumeration, and removal on the root's attribute collection.
        /// </summary>
        [Test]
        public void TestAttributes()
        {
            // NOTE(review): input literal looks stripped of its markup - confirm.
            string xml = "";
            XmlLightDocument doc = new XmlLightDocument(xml);

            Assert.AreEqual("root", doc.Root.LocalName);
            Assert.AreEqual(1, doc.Root.Attributes.Count);
            Assert.IsTrue(doc.Root.Attributes.GetEnumerator().MoveNext());
            Assert.IsTrue(((System.Collections.IEnumerable)doc.Root.Attributes).GetEnumerator().MoveNext());
            Assert.IsTrue(doc.Root.Attributes.Remove("id"));
            Assert.AreEqual(0, doc.Root.Attributes.Count);
        }

        /// <summary>
        /// Assembles a small tree by hand (element, text, comment, and a control node that is
        /// removed again) and checks the unformatted output.
        /// </summary>
        [Test]
        public void TestManuallyCreated()
        {
            XmlLightElement root = new XmlLightElement(null, "root");
            new XmlLightElement(root, "a").Attributes["b"] = "c";
            new XmlLightElement(root, XmlLightElement.TEXT).Value = "Normal & Text";
            new XmlLightElement(root, XmlLightElement.COMMENT).OriginalTag = "";
            new XmlLightElement(root, XmlLightElement.CONTROL) { OriginalTag = "" }.Remove();

            using (StringWriter sw = new StringWriter())
            {
                root.WriteUnformatted(sw);
                // NOTE(review): expected literal looks partially stripped of markup - confirm.
                Assert.AreEqual("Normal & <Encoded> Text", sw.ToString());
            }
        }

        /// <summary>
        /// Manual timing run (Explicit): parses every *.htm file under c:\temp\trash ten times
        /// with each parser flavour and prints the elapsed milliseconds.
        /// </summary>
        [Test, Explicit]
        public void RunPerfTests()
        {
            string[] files = Directory.GetFiles(@"c:\temp\trash", "*.htm", SearchOption.AllDirectories);

            for (int i = 0; i < 10; i++)
            {
                //HTML Parser
                System.Diagnostics.Stopwatch htmlTimer = System.Diagnostics.Stopwatch.StartNew();
                foreach (string file in files)
                {
                    new HtmlLightDocument(File.ReadAllText(file));
                }
                Console.WriteLine("HTML = {0}", htmlTimer.ElapsedMilliseconds);

                //XML Parser
                System.Diagnostics.Stopwatch xmlTimer = System.Diagnostics.Stopwatch.StartNew();
                foreach (string file in files)
                {
                    new XmlLightDocument(File.ReadAllText(file));
                }
                Console.WriteLine("XHTM = {0}", xmlTimer.ElapsedMilliseconds);

                //Parse Only
                System.Diagnostics.Stopwatch parseTimer = System.Diagnostics.Stopwatch.StartNew();
                IXmlLightReader rdr = new EmptyReader();
                foreach (string file in files)
                {
                    XmlLightParser.Parse(File.ReadAllText(file), XmlLightParser.AttributeFormat.Xml, rdr);
                }
                Console.WriteLine("NDOM = {0}", parseTimer.ElapsedMilliseconds);

                //Text Only
                System.Diagnostics.Stopwatch textTimer = System.Diagnostics.Stopwatch.StartNew();
                foreach (string file in files)
                {
                    XmlLightParser.ParseText(File.ReadAllText(file));
                }
                Console.WriteLine("TEXT = {0}", textTimer.ElapsedMilliseconds);
            }
        }

        /// <summary>
        /// IXmlLightReader whose callbacks all do nothing; used by RunPerfTests to time
        /// parsing without building a document.
        /// </summary>
        class EmptyReader : IXmlLightReader
        {
            public void AddCData(string cdata) { }
            public void AddComment(string comment) { }
            public void AddControl(string cdata) { }
            public void AddInstruction(string instruction) { }
            public void AddText(string content) { }
            public void EndDocument() { }
            public void EndTag(XmlTagInfo tag) { }
            public void StartDocument() { }
            public void StartTag(XmlTagInfo tag) { }
        }

        /// <summary>Plain text without a root element is expected to raise XmlException.</summary>
        [Test, ExpectedException(typeof(System.Xml.XmlException))]
        public void TestXmlNoRootNode()
        {
            new XmlLightDocument("no xml root node defined");
        }

        /// <summary>Expects XmlException from the XML document constructor.</summary>
        [Test, ExpectedException(typeof(System.Xml.XmlException))]
        public void TestXmlNoClosingTag()
        {
            // NOTE(review): input literal looks stripped of its markup - confirm.
            new XmlLightDocument("");
        }

        /// <summary>Expects XmlException from the XML document constructor.</summary>
        [Test, ExpectedException(typeof(System.Xml.XmlException))]
        public void TestXmlWrongClosingTag()
        {
            // NOTE(review): input literal looks stripped of its markup - confirm.
            new XmlLightDocument("");
        }

        /// <summary>Expects ApplicationException from the HTML document constructor.</summary>
        [Test, ExpectedException(typeof(System.ApplicationException))]
        public void TestRootNodeNotHtml()
        {
            // NOTE(review): input literal looks stripped of its markup - confirm.
            new HtmlLightDocument("");
        }
    }
}