summary refs log tree commit diff
diff options
context:
space:
mode:
author Felix Hanley <felix@userspace.com.au> 2018-11-21 04:57:53 +0000
committer Felix Hanley <felix@userspace.com.au> 2018-11-21 04:57:53 +0000
commit 914623e6160d851fe33d54079a9058e4954127fc (patch)
tree 2471ad9e13e917fe35db9b26ce90f2e7378e34a0
parent ed290df356a23145759b7c260749a60e4ae8ab65 (diff)
download query-914623e6160d851fe33d54079a9058e4954127fc.tar.gz
query-914623e6160d851fe33d54079a9058e4954127fc.tar.bz2
Add HTML parser wrapper
-rw-r--r-- html/node.go 86
-rw-r--r-- html/parse.go 61
-rw-r--r-- html/parse_test.go 80
3 files changed, 227 insertions, 0 deletions
diff --git a/html/node.go b/html/node.go
new file mode 100644
index 0000000..ed51aaf
--- /dev/null
+++ b/html/node.go
@@ -0,0 +1,86 @@
+package html
+
+import (
+ "fmt"
+
+ x "golang.org/x/net/html"
+ base "src.userspace.com.au/query"
+)
+
+// Node wraps a parsed *x.Node and records its position in a tree of
+// wrappers. The embedded node supplies the type and the data (tag name for
+// element nodes, content for text nodes).
+type Node struct {
+	// Links to neighbouring wrappers; they are set by appendChild.
+	parent      *Node
+	prevSibling *Node
+	nextSibling *Node
+	firstChild  *Node
+	lastChild   *Node
+
+	// level is the depth recorded when the wrapper is created.
+	level int
+
+	*x.Node
+}
+
+// Parent returns the node's parent, or nil when it has none.
+//
+// The wrapper link is preferred so the returned node keeps its own links and
+// level. A wrapper that carries no links falls back to the embedded node's
+// parent. nil is returned explicitly so the interface value never holds a
+// typed nil pointer.
+func (n *Node) Parent() base.Node {
+	if n.parent != nil {
+		return n.parent
+	}
+	if n.Node == nil || n.Node.Parent == nil {
+		return nil
+	}
+	return &Node{Node: n.Node.Parent}
+}
+// NextSibling returns the node that follows this one under the same parent,
+// or nil when this is the last child.
+//
+// The wrapper link is preferred so the returned node keeps its own links and
+// level. A wrapper that carries no links falls back to the embedded node's
+// sibling. nil is returned explicitly so the interface value never holds a
+// typed nil pointer.
+func (n *Node) NextSibling() base.Node {
+	if n.nextSibling != nil {
+		return n.nextSibling
+	}
+	if n.Node == nil || n.Node.NextSibling == nil {
+		return nil
+	}
+	return &Node{Node: n.Node.NextSibling}
+}
+// FirstChild returns the node's first child, or nil when it has no children.
+//
+// The wrapper link is preferred so the returned node keeps its own links and
+// level. A wrapper that carries no links falls back to the embedded node's
+// first child. nil is returned explicitly so the interface value never holds
+// a typed nil pointer.
+func (n *Node) FirstChild() base.Node {
+	if n.firstChild != nil {
+		return n.firstChild
+	}
+	if n.Node == nil || n.Node.FirstChild == nil {
+		return nil
+	}
+	return &Node{Node: n.Node.FirstChild}
+}
+// PrevSibling returns the node that precedes this one under the same parent,
+// or nil when this is the first child.
+//
+// The wrapper link is preferred so the returned node keeps its own links and
+// level. A wrapper that carries no links falls back to the embedded node's
+// sibling. A missing sibling yields a plain nil rather than a wrapper around
+// a nil *x.Node, which would be a non-nil interface that panics on use.
+func (n *Node) PrevSibling() base.Node {
+	if n.prevSibling != nil {
+		return n.prevSibling
+	}
+	if n.Node == nil || n.Node.PrevSibling == nil {
+		return nil
+	}
+	return &Node{Node: n.Node.PrevSibling}
+}
+// LastChild returns the node's last child, or nil when it has no children.
+//
+// The wrapper link is preferred so the returned node keeps its own links and
+// level. A wrapper that carries no links falls back to the embedded node's
+// last child. A missing child yields a plain nil rather than a wrapper around
+// a nil *x.Node, which would be a non-nil interface that panics on use.
+func (n *Node) LastChild() base.Node {
+	if n.lastChild != nil {
+		return n.lastChild
+	}
+	if n.Node == nil || n.Node.LastChild == nil {
+		return nil
+	}
+	return &Node{Node: n.Node.LastChild}
+}
+// Type reports the kind of node, converted from the embedded node's type.
+func (n *Node) Type() base.NodeType {
+	return base.NodeType(n.Node.Type)
+}
+func (n *Node) DataType() string { return "string" }
+// Attr returns the node's attributes converted to base.Attribute, in the
+// order the embedded node holds them. The result is a new slice.
+func (n *Node) Attr() []base.Attribute {
+	// Length 0 with capacity for every attribute: allocating with a non-zero
+	// length and then appending would leave zero-valued entries at the front
+	// and double the length of the result.
+	out := make([]base.Attribute, 0, len(n.Node.Attr))
+	for _, a := range n.Node.Attr {
+		out = append(out, base.Attribute(a))
+	}
+	return out
+}
+
+// Data returns the embedded node's own data. Child nodes are not included.
+func (n *Node) Data() string {
+	return n.Node.Data
+}
+
+// InnerText gets the value of the node and all its child nodes.
+func (n *Node) InnerText() string {
+ // FIXME
+ return n.Node.Data
+}
+
+// String formats the node as "[<type name>] <data type>(<data>)", taking the
+// type name from base.NodeNames.
+func (n Node) String() string {
+	typeName := base.NodeNames[n.Type()]
+	return fmt.Sprintf("[%s] %s(%s)", typeName, n.DataType(), n.Data())
+}
+
+// PrintTree writes the node and then each of its descendants to standard
+// output, one node per line. Every line is preceded by one indent unit per
+// level; children are printed at level+1.
+func (n Node) PrintTree(level int) {
+	for indent := 0; indent < level; indent++ {
+		fmt.Printf(" ")
+	}
+	fmt.Println(n)
+
+	child := n.firstChild
+	for child != nil {
+		child.PrintTree(level + 1)
+		child = child.nextSibling
+	}
+}
+
+// appendChild links c in as the last child of n. It panics if c already has
+// a parent or a sibling.
+func (n *Node) appendChild(c *Node) {
+	attached := c.parent != nil || c.prevSibling != nil || c.nextSibling != nil
+	if attached {
+		panic("html: appendChild called for an attached child Node")
+	}
+
+	prev := n.lastChild
+	c.parent = n
+	c.prevSibling = prev
+
+	if prev == nil {
+		// First child of n.
+		n.firstChild = c
+	} else {
+		prev.nextSibling = c
+	}
+	n.lastChild = c
+}
diff --git a/html/parse.go b/html/parse.go
new file mode 100644
index 0000000..05359ec
--- /dev/null
+++ b/html/parse.go
@@ -0,0 +1,61 @@
+package html
+
+import (
+ //"fmt"
+ "io"
+
+ x "golang.org/x/net/html"
+)
+
+// Parse reads HTML from r with x.Parse and returns the resulting tree
+// wrapped in Nodes, with the root at level 0. A parse error is returned
+// unchanged together with a nil node.
+func Parse(r io.Reader) (*Node, error) {
+	doc, err := x.Parse(r)
+	if err != nil {
+		return nil, err
+	}
+	return wrapNodes(doc, 0), nil
+}
+
+// wrapNodes builds a Node around root at level l, then recursively wraps
+// each child of root at level l+1 and appends it in sibling order.
+func wrapNodes(root *x.Node, l int) *Node {
+	wrapped := &Node{Node: root, level: l}
+
+	childLevel := l + 1
+	child := root.FirstChild
+	for child != nil {
+		wrapped.appendChild(wrapNodes(child, childLevel))
+		child = child.NextSibling
+	}
+	return wrapped
+}
diff --git a/html/parse_test.go b/html/parse_test.go
new file mode 100644
index 0000000..cfc6997
--- /dev/null
+++ b/html/parse_test.go
@@ -0,0 +1,80 @@
+package html
+
+import (
+ "strings"
+ "testing"
+
+ base "src.userspace.com.au/query"
+)
+
+// TestParse parses a small document and walks document -> <html> -> <body>
+// -> <p>, checking the type and data of each node on the way.
+func TestParse(t *testing.T) {
+	src := `<html><body><p>One</p><p>Two</p></body></html>`
+
+	doc, err := Parse(strings.NewReader(src))
+	if err != nil {
+		t.Fatalf("Expected no error but got %s", err)
+	}
+	if doc == nil {
+		t.Fatal("Expected node but got nil")
+	}
+
+	//doc.PrintTree(0)
+
+	// document
+
+	nt := doc.Type()
+	nd := doc.Data()
+	if nt != base.DocumentNode {
+		// %v rather than %q: the node type is not a string.
+		t.Fatalf("Expected %q but got %v", "DocumentNode", nt)
+	}
+	if nd != "" {
+		t.Fatalf("Expected %q but got %q", "", nd)
+	}
+
+	// get <html>
+	n := doc.FirstChild()
+	if n == nil {
+		t.Fatal("Expected node but got nil")
+	}
+
+	nt = n.Type()
+	nd = n.Data()
+	if nt != base.ElementNode {
+		// Report the type that was found, not the data.
+		t.Fatalf("Expected %q but got %v", "ElementNode", nt)
+	}
+	if nd != "html" {
+		t.Fatalf("Expected %q but got %q", "html", nd)
+	}
+
+	// get <body>
+	// NOTE(review): LastChild is used rather than FirstChild, presumably
+	// because the parser inserts an implied <head> ahead of <body> - confirm.
+	n = n.LastChild()
+	if n == nil {
+		t.Fatal("Expected node but got nil")
+	}
+
+	nt = n.Type()
+	nd = n.Data()
+	if nt != base.ElementNode {
+		t.Fatalf("Expected %q but got %v", "ElementNode", nt)
+	}
+	if nd != "body" {
+		t.Fatalf("Expected %q but got %q", "body", nd)
+	}
+
+	// get last <p>
+	n = n.LastChild()
+	if n == nil {
+		t.Fatal("Expected node but got nil")
+	}
+
+	nt = n.Type()
+	nd = n.Data()
+	if nt != base.ElementNode {
+		t.Fatalf("Expected %q but got %v", "ElementNode", nt)
+	}
+	if nd != "p" {
+		t.Fatalf("Expected %q but got %q", "p", nd)
+	}
+}