diff options
| author | Felix Hanley <felix@userspace.com.au> | 2018-11-21 04:57:53 +0000 |
|---|---|---|
| committer | Felix Hanley <felix@userspace.com.au> | 2018-11-21 04:57:53 +0000 |
| commit | 914623e6160d851fe33d54079a9058e4954127fc (patch) | |
| tree | 2471ad9e13e917fe35db9b26ce90f2e7378e34a0 | |
| parent | ed290df356a23145759b7c260749a60e4ae8ab65 (diff) | |
| download | query-914623e6160d851fe33d54079a9058e4954127fc.tar.gz query-914623e6160d851fe33d54079a9058e4954127fc.tar.bz2 | |
Add HTML parser wrapper
| -rw-r--r-- | html/node.go | 86 | ||||
| -rw-r--r-- | html/parse.go | 61 | ||||
| -rw-r--r-- | html/parse_test.go | 80 |
3 files changed, 227 insertions, 0 deletions
diff --git a/html/node.go b/html/node.go new file mode 100644 index 0000000..ed51aaf --- /dev/null +++ b/html/node.go @@ -0,0 +1,86 @@ +package html + +import ( + "fmt" + + x "golang.org/x/net/html" + base "src.userspace.com.au/query" +) + +// A Node consists of a NodeType and some data (tag name for +// element nodes, content for text) and are part of a tree of Nodes. +type Node struct { + parent, prevSibling, nextSibling, firstChild, lastChild *Node + + level int + + *x.Node +} + +func (n *Node) Parent() base.Node { + if n.parent == nil { + return nil + } + return &Node{Node: n.Node.Parent} +} +func (n *Node) NextSibling() base.Node { + if n.nextSibling == nil { + return nil + } + return &Node{Node: n.Node.NextSibling} +} +func (n *Node) FirstChild() base.Node { + if n.firstChild == nil { + return nil + } + return &Node{Node: n.Node.FirstChild} +} +func (n *Node) PrevSibling() base.Node { return &Node{Node: n.Node.PrevSibling} } +func (n *Node) LastChild() base.Node { return &Node{Node: n.Node.LastChild} } +func (n *Node) Type() base.NodeType { return base.NodeType(n.Node.Type) } +func (n *Node) DataType() string { return "string" } +func (n *Node) Attr() []base.Attribute { + out := make([]base.Attribute, len(n.Node.Attr)) + for _, a := range n.Node.Attr { + out = append(out, base.Attribute(a)) + } + return out +} + +// Data gets the value of the node and all its child nodes. +func (n *Node) Data() string { return n.Node.Data } + +// InnerText gets the value of the node and all its child nodes. +func (n *Node) InnerText() string { + // FIXME + return n.Node.Data +} + +func (n Node) String() string { + return fmt.Sprintf("[%s] %s(%s)", base.NodeNames[n.Type()], n.DataType(), n.Data()) +} + +func (n Node) PrintTree(level int) { + for i := 1; i <= level; i++ { + fmt.Printf(" ") + } + fmt.Println(n) + for c := n.firstChild; c != nil; c = c.nextSibling { + c.PrintTree(level + 1) + } +} + +func (n *Node) appendChild(c *Node) { + if c.parent != nil || c.prevSibling != nil || c.nextSibling != nil { + panic("html: appendChild called for an attached child Node") + } + last := n.lastChild + if last != nil { + last.nextSibling = c + } else { + n.firstChild = c + } + n.lastChild = c + c.parent = n + c.prevSibling = last +} diff --git a/html/parse.go b/html/parse.go new file mode 100644 index 0000000..05359ec --- /dev/null +++ b/html/parse.go @@ -0,0 +1,61 @@ +package html + +import ( + //"fmt" + "io" + + x "golang.org/x/net/html" +) + +func Parse(r io.Reader) (*Node, error) { + xnode, err := x.Parse(r) + if err != nil { + return nil, err + } + /* + if len(xnodes) > 1 { + return nil, fmt.Errorf("found multiple HTML roots: %d", len(xnodes)) + } + */ + + root := wrapNodes(xnode, 0) + return root, nil +} + +func wrapNodes(root *x.Node, l int) *Node { + out := &Node{Node: root, level: l} + + for c := root.FirstChild; c != nil; c = c.NextSibling { + child := wrapNodes(c, l+1) + out.appendChild(child) + } + /* + if root.Parent != nil { + out.parent = &Node{Node: root.Parent} + if l > 0 { + out.parent.level = l - 1 + } + } + + if root.FirstChild != nil { + out.firstChild = wrapNodes(root.FirstChild, l+1) + } + + if root.NextSibling != nil { + out.nextSibling = wrapNodes(root.NextSibling, l) + } + + if root.LastChild != nil { + out.lastChild = wrapNodes(root.LastChild, l+1) + } + if root.PrevSibling != nil { + //out.prevSibling = wrapNodes(root.prevSibling, l) + out.prevSibling = &Node{ + Node: root.PrevSibling, + level: l, + parent: out.parent, + } + } + */ + return out +} diff --git a/html/parse_test.go b/html/parse_test.go new file mode 100644 index 0000000..cfc6997 --- /dev/null +++ b/html/parse_test.go @@ -0,0 +1,80 @@ +package html + +import ( + "strings" + "testing" + + base "src.userspace.com.au/query" +) + +func TestParse(t *testing.T) { + src := `<html><body><p>One</p><p>Two</p></body></html>` + + doc, err := Parse(strings.NewReader(src)) + if err != nil { + t.Fatalf("Expected no error but got %s", err) + } + if doc == nil { + t.Fatal("Expected node but got nil") + } + + //doc.PrintTree(0) + + // document + + nt := doc.Type() + nd := doc.Data() + if nt != base.DocumentNode { + t.Fatalf("Expected %q but got %q", "DocumentNode", nt) + } + if nd != "" { + t.Fatalf("Expected %q but got %q", "", nd) + } + + // get <html> + n := doc.FirstChild() + if n == nil { + t.Fatal("Expected node but got nil") + } + + nt = n.Type() + nd = n.Data() + if nt != base.ElementNode { + t.Fatalf("Expected %q but got %q", "ElementNode", nd) + } + if nd != "html" { + t.Fatalf("Expected %q but got %q", "html", nd) + } + + // get <body> + //n = n.FirstChild() + // TODO why? + n = n.LastChild() + if n == nil { + t.Fatal("Expected node but got nil") + } + + nt = n.Type() + nd = n.Data() + if nt != base.ElementNode { + t.Fatalf("Expected %q but got %q", "ElementNode", nd) + } + if nd != "body" { + t.Fatalf("Expected %q but got %q", "body", nd) + } + + // get first <p> + n = n.LastChild() + if n == nil { + t.Fatal("Expected node but got nil") + } + + nt = n.Type() + nd = n.Data() + if nt != base.ElementNode { + t.Fatalf("Expected %q but got %q", "ElementNode", nd) + } + if nd != "p" { + t.Fatalf("Expected %q but got %q", "p", nd) + } +} |
