package html

pakakeh.go – git.sr.ht/~shulhan/pakakeh.go/lib/html Index | Examples | Files

package html

import "git.sr.ht/~shulhan/pakakeh.go/lib/html"

Package html extends golang.org/x/net/html by providing simplified methods on Node.

The x/net/html package currently provides only bare, raw functionality to iterate the tree; there is no check for empty nodes, and no function to get an attribute by name without looping over the attributes manually.

This package extends the parent package by adding methods to get a node's attribute by name, get the first non-empty child, and get the next non-empty sibling, as well as a method to iterate the tree.

Index ¶

func NormalizeForID(in string) (out string)
func Sanitize(in []byte) (plain []byte)
type Node

func NewNode(el *html.Node) *Node
func (node *Node) GetAttrValue(key string) string
func (node *Node) GetFirstChild() *Node
func (node *Node) GetNextSibling() *Node
func (node *Node) IsElement() bool

type NodeIterator

func Parse(r io.Reader) (iter *NodeIterator, err error)
func (iter *NodeIterator) Next() *Node
func (iter *NodeIterator) SetNext(el *Node)
func (iter *NodeIterator) SetNextNode(el *html.Node)

Functions ¶

func NormalizeForID ¶

func NormalizeForID(in string) (out string)

NormalizeForID normalizes the given input string into an HTML ID. The normalization follows the Mozilla specification rules:

it must not contain whitespace (spaces, tabs etc.),
only ASCII letters, digits, '_', and '-' should be used, and
it should start with a letter.

This function:

returns "_" if the input is an empty string,
replaces unknown characters with '_',
prefixes the output with '_' unless it starts with '-', '_', or a letter, and
converts letters to lower case.

Example¶

Code:

{
	fmt.Println(NormalizeForID(""))
	fmt.Println(NormalizeForID(" id "))
	fmt.Println(NormalizeForID(" ID "))
	fmt.Println(NormalizeForID("_id.1"))
	fmt.Println(NormalizeForID("1-d"))
	fmt.Println(NormalizeForID(".123 ABC def"))
	fmt.Println(NormalizeForID("test 123"))
	fmt.Println(NormalizeForID("⌘"))
	// Output:
	// _
	// _id_
	// _id_
	// _id_1
	// _1-d
	// _123_abc_def
	// test_123
	// ___
}

Output:

_
_id_
_id_
_id_1
_1-d
_123_abc_def
test_123
___

func Sanitize ¶

func Sanitize(in []byte) (plain []byte)

Sanitize the content of HTML into plain text.

Example¶

Code:

{
	input := `
<html>
	<title>Test</title>
	<head>
	</head>
	<body>
		This
		<p> is </p>
		a
		<a href="/">link</a>.
		An another
		<a href="/">link</a>.
	</body>
</html>
`

	out := Sanitize([]byte(input))
	fmt.Printf("%s", out)

	// Output:
	// This is a link. An another link.
}

Output:

This is a link. An another link.

Types ¶

type Node ¶

type Node struct {
	*html.Node
}

Node extends the html.Node.

func NewNode ¶

func NewNode(el *html.Node) *Node

NewNode creates a new Node by embedding the html.Node "el".

func (*Node) GetAttrValue ¶

func (node *Node) GetAttrValue(key string) string

GetAttrValue returns the value of the node's attribute with the specified key, or an empty string if the key is not found.

func (*Node) GetFirstChild ¶

func (node *Node) GetFirstChild() *Node

GetFirstChild returns the first non-empty child of the node, or nil if there is no child left.

func (*Node) GetNextSibling ¶

func (node *Node) GetNextSibling() *Node

GetNextSibling returns the next non-empty sibling of the node, or nil if there are no more siblings left.

func (*Node) IsElement ¶

func (node *Node) IsElement() bool

IsElement will return true if node type is html.ElementNode.

type NodeIterator ¶

type NodeIterator struct {
	// contains filtered or unexported fields
}

NodeIterator simplifies iterating over each node from top to bottom.

func Parse ¶

func Parse(r io.Reader) (iter *NodeIterator, err error)

Parse returns the NodeIterator to iterate through HTML tree.

Example¶

Code:

{
	rawHTML := `
<ul>
	<li>
		<b>item</b>
		<span>one</span>
	</li>
</ul>
`

	r := strings.NewReader(rawHTML)

	iter, err := Parse(r)
	if err != nil {
		log.Fatal(err)
	}

	for node := iter.Next(); node != nil; node = iter.Next() {
		if node.IsElement() {
			fmt.Printf("%s\n", node.Data)
		} else {
			fmt.Printf("\t%s\n", node.Data)
		}
	}

	// Output:
	// html
	// head
	// body
	// ul
	// li
	// b
	// 	item
	// b
	// span
	// 	one
	// span
	// li
	// ul
	// body
	// html
}

Output:

html
head
body
ul
li
b
	item
b
span
	one
span
li
ul
body
html

func (*NodeIterator) Next ¶

func (iter *NodeIterator) Next() *Node

Next returns the first child or the next sibling of the current node. If there are no more nodes in the tree, it returns nil.

func (*NodeIterator) SetNext ¶

func (iter *NodeIterator) SetNext(el *Node)

SetNext sets the node for iteration to Node "el" only if it is not nil.

Example¶

Code:

{
	rawHTML := `
<ul>
	<li>
		<b>item</b>
		<span>one</span>
	</li>
</ul>
<h2>Jump here</h2>
`

	r := strings.NewReader(rawHTML)

	iter, err := Parse(r)
	if err != nil {
		log.Fatal(err)
	}

	for node := iter.Next(); node != nil; node = iter.Next() {
		if node.IsElement() {
			if node.Data == "ul" {
				// Skip iterating the "ul" element.
				iter.SetNext(node.GetNextSibling())
				continue
			}
			fmt.Printf("%s\n", node.Data)
		} else {
			fmt.Printf("\t%s\n", node.Data)
		}
	}

	// Output:
	// html
	// head
	// body
	// h2
	// 	Jump here
	// h2
	// body
	// html
}

Output:

html
head
body
h2
	Jump here
h2
body
html

func (*NodeIterator) SetNextNode ¶

func (iter *NodeIterator) SetNextNode(el *html.Node)

SetNextNode sets the next iteration node to html.Node "el" only if it is not nil.

Source Files ¶

html.go node.go node_iterator.go

Version: v0.60.1 (latest)
Published: Apr 18, 2025
Platform: linux/amd64
Imports: 5 packages
Last checked: 3 weeks ago –

Tools for package owners.

?	: This menu
/	: Search site
f	: Jump to identifier
g then g	: Go to top of page
g then b	: Go to end of page
G	: Go to end of page
g then i	: Go to index
g then e	: Go to examples

package html

Index ¶

Examples ¶

Functions ¶

func NormalizeForID ¶

func Sanitize ¶

Types ¶

type Node ¶

func NewNode ¶

func (*Node) GetAttrValue ¶

func (*Node) GetFirstChild ¶

func (*Node) GetNextSibling ¶

func (*Node) IsElement ¶

type NodeIterator ¶

func Parse ¶

func (*NodeIterator) Next ¶

func (*NodeIterator) SetNext ¶

func (*NodeIterator) SetNextNode ¶

Source Files ¶

Jump to identifier

Keyboard shortcuts