Skip to content

Commit 0833b63

Browse files
committed
publicsuffix: embed table data
Use //go:embed to embed the public suffix tables, rather than generating .go files containing the data.

Creating an empty git repo and generating commits for the last 20 updates to the public suffix list, the total size of the repository directory as measured by "du -sh" decreases from 2.2M to 668K when using embedding.

For golang/go#15518.

Change-Id: Id71759765831a7699e7a182937095b3820bb643b
Reviewed-on: https://go-review.googlesource.com/c/net/+/450935
Run-TryBot: Damien Neil <dneil@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
Reviewed-by: Nigel Tao (INACTIVE; USE @golang.org INSTEAD) <nigeltao@google.com>
1 parent ecf091a commit 0833b63

File tree

6 files changed

+134
-11018
lines changed

6 files changed

+134
-11018
lines changed

publicsuffix/data/children

2.81 KB
Binary file not shown.

publicsuffix/data/nodes

47.1 KB
Binary file not shown.

publicsuffix/data/text

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

publicsuffix/gen.go

Lines changed: 97 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ package main
2222
import (
2323
"bufio"
2424
"bytes"
25+
"encoding/binary"
2526
"flag"
2627
"fmt"
2728
"go/format"
@@ -55,6 +56,7 @@ const (
5556
)
5657

5758
var (
59+
combinedText string
5860
maxChildren int
5961
maxTextOffset int
6062
maxTextLength int
@@ -115,11 +117,10 @@ var (
115117
shaRE = regexp.MustCompile(`"sha":"([^"]+)"`)
116118
dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`)
117119

118-
comments = flag.Bool("comments", false, "generate table.go comments, for debugging")
119-
subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
120-
url = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead")
121-
v = flag.Bool("v", false, "verbose output (to stderr)")
122-
version = flag.String("version", "", "the effective_tld_names.dat version")
120+
subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
121+
url = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead")
122+
v = flag.Bool("v", false, "verbose output (to stderr)")
123+
version = flag.String("version", "", "the effective_tld_names.dat version")
123124
)
124125

125126
func main() {
@@ -254,7 +255,33 @@ func main1() error {
254255
}
255256
sort.Strings(labelsList)
256257

257-
if err := generate(printReal, &root, "table.go"); err != nil {
258+
combinedText = combineText(labelsList)
259+
if combinedText == "" {
260+
return fmt.Errorf("internal error: combineText returned no text")
261+
}
262+
for _, label := range labelsList {
263+
offset, length := strings.Index(combinedText, label), len(label)
264+
if offset < 0 {
265+
return fmt.Errorf("internal error: could not find %q in text %q", label, combinedText)
266+
}
267+
maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
268+
if offset >= 1<<nodesBitsTextOffset {
269+
return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
270+
}
271+
if length >= 1<<nodesBitsTextLength {
272+
return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
273+
}
274+
labelEncoding[label] = uint64(offset)<<nodesBitsTextLength | uint64(length)
275+
}
276+
277+
if err := root.walk(assignIndexes); err != nil {
278+
return err
279+
}
280+
281+
if err := generate(printMetadata, &root, "table.go"); err != nil {
282+
return err
283+
}
284+
if err := generateBinaryData(&root, combinedText); err != nil {
258285
return err
259286
}
260287
if err := generate(printTest, &root, "table_test.go"); err != nil {
@@ -307,18 +334,63 @@ func printTest(w io.Writer, n *node) error {
307334
fmt.Fprintf(w, "%q,\n", rule)
308335
}
309336
fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
310-
if err := n.walk(w, printNodeLabel); err != nil {
337+
if err := n.walk(func(n *node) error {
338+
return printNodeLabel(w, n)
339+
}); err != nil {
311340
return err
312341
}
313342
fmt.Fprintf(w, "}\n")
314343
return nil
315344
}
316345

317-
func printReal(w io.Writer, n *node) error {
346+
func generateBinaryData(root *node, combinedText string) error {
347+
if err := os.WriteFile("data/text", []byte(combinedText), 0666); err != nil {
348+
return err
349+
}
350+
351+
var nodes []byte
352+
if err := root.walk(func(n *node) error {
353+
for _, c := range n.children {
354+
nodes = appendNodeEncoding(nodes, c)
355+
}
356+
return nil
357+
}); err != nil {
358+
return err
359+
}
360+
if err := os.WriteFile("data/nodes", nodes, 0666); err != nil {
361+
return err
362+
}
363+
364+
var children []byte
365+
for _, c := range childrenEncoding {
366+
children = binary.BigEndian.AppendUint32(children, c)
367+
}
368+
if err := os.WriteFile("data/children", children, 0666); err != nil {
369+
return err
370+
}
371+
372+
return nil
373+
}
374+
375+
func appendNodeEncoding(b []byte, n *node) []byte {
376+
encoding := labelEncoding[n.label]
377+
if n.icann {
378+
encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
379+
}
380+
encoding |= uint64(n.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
381+
for i := nodesBits - 8; i >= 0; i -= 8 {
382+
b = append(b, byte((encoding>>i)&0xff))
383+
}
384+
return b
385+
}
386+
387+
func printMetadata(w io.Writer, n *node) error {
318388
const header = `// generated by go run gen.go; DO NOT EDIT
319389
320390
package publicsuffix
321391
392+
import _ "embed"
393+
322394
const version = %q
323395
324396
const (
@@ -343,74 +415,36 @@ const (
343415
// numTLD is the number of top level domains.
344416
const numTLD = %d
345417
418+
// text is the combined text of all labels.
419+
//
420+
//go:embed data/text
421+
var text string
422+
346423
`
347424
fmt.Fprintf(w, header, *version,
348425
nodesBits,
349426
nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
350427
childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
351428
nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
352-
353-
text := combineText(labelsList)
354-
if text == "" {
355-
return fmt.Errorf("internal error: makeText returned no text")
356-
}
357-
for _, label := range labelsList {
358-
offset, length := strings.Index(text, label), len(label)
359-
if offset < 0 {
360-
return fmt.Errorf("internal error: could not find %q in text %q", label, text)
361-
}
362-
maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
363-
if offset >= 1<<nodesBitsTextOffset {
364-
return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
365-
}
366-
if length >= 1<<nodesBitsTextLength {
367-
return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
368-
}
369-
labelEncoding[label] = uint64(offset)<<nodesBitsTextLength | uint64(length)
370-
}
371-
fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
372-
for len(text) > 0 {
373-
n, plus := len(text), ""
374-
if n > 64 {
375-
n, plus = 64, " +"
376-
}
377-
fmt.Fprintf(w, "%q%s\n", text[:n], plus)
378-
text = text[n:]
379-
}
380-
381-
if err := n.walk(w, assignIndexes); err != nil {
382-
return err
383-
}
384-
385429
fmt.Fprintf(w, `
386-
387430
// nodes is the list of nodes. Each node is represented as a %v-bit integer,
388431
// which encodes the node's children, wildcard bit and node type (as an index
389432
// into the children array), ICANN bit and text.
390433
//
391-
// If the table was generated with the -comments flag, there is a //-comment
392-
// after each node's data. In it is the nodes-array indexes of the children,
393-
// formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
394-
// nodeType is printed as + for normal, ! for exception, and o for parent-only
395-
// nodes that have children but don't match a domain label in their own right.
396-
// An I denotes an ICANN domain.
397-
//
398434
// The layout within the node, from MSB to LSB, is:
399435
// [%2d bits] unused
400436
// [%2d bits] children index
401437
// [%2d bits] ICANN bit
402438
// [%2d bits] text index
403439
// [%2d bits] text length
404-
var nodes = [...]uint8{
440+
//
441+
//go:embed data/nodes
442+
var nodes uint40String
405443
`,
406444
nodesBits,
407445
nodesBits-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
408446
nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
409-
if err := n.walk(w, printNode); err != nil {
410-
return err
411-
}
412-
fmt.Fprintf(w, `}
413-
447+
fmt.Fprintf(w, `
414448
// children is the list of nodes' children, the parent's wildcard bit and the
415449
// parent's node type. If a node has no children then their children index
416450
// will be in the range [0, 6), depending on the wildcard bit and node type.
@@ -421,27 +455,13 @@ var nodes = [...]uint8{
421455
// [%2d bits] node type
422456
// [%2d bits] high nodes index (exclusive) of children
423457
// [%2d bits] low nodes index (inclusive) of children
424-
var children=[...]uint32{
458+
//
459+
//go:embed data/children
460+
var children uint32String
425461
`,
426462
32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
427463
childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
428-
for i, c := range childrenEncoding {
429-
s := "---------------"
430-
lo := c & (1<<childrenBitsLo - 1)
431-
hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1)
432-
if lo != hi {
433-
s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi)
434-
}
435-
nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1)
436-
wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0
437-
if *comments {
438-
fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n",
439-
c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType))
440-
} else {
441-
fmt.Fprintf(w, "0x%x,\n", c)
442-
}
443-
}
444-
fmt.Fprintf(w, "}\n\n")
464+
445465
fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
446466
fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
447467
fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
@@ -465,12 +485,12 @@ type node struct {
465485
children []*node
466486
}
467487

468-
func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
469-
if err := f(w, n); err != nil {
488+
func (n *node) walk(f func(*node) error) error {
489+
if err := f(n); err != nil {
470490
return err
471491
}
472492
for _, c := range n.children {
473-
if err := c.walk(w, f); err != nil {
493+
if err := c.walk(f); err != nil {
474494
return err
475495
}
476496
}
@@ -516,7 +536,7 @@ var childrenEncoding = []uint32{
516536

517537
var firstCallToAssignIndexes = true
518538

519-
func assignIndexes(w io.Writer, n *node) error {
539+
func assignIndexes(n *node) error {
520540
if len(n.children) != 0 {
521541
// Assign nodesIndex.
522542
n.firstChild = nextNodesIndex
@@ -561,32 +581,6 @@ func assignIndexes(w io.Writer, n *node) error {
561581
return nil
562582
}
563583

564-
func printNode(w io.Writer, n *node) error {
565-
for _, c := range n.children {
566-
s := "---------------"
567-
if len(c.children) != 0 {
568-
s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children))
569-
}
570-
encoding := labelEncoding[c.label]
571-
if c.icann {
572-
encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
573-
}
574-
encoding |= uint64(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
575-
for i := nodesBits - 8; i >= 0; i -= 8 {
576-
fmt.Fprintf(w, "0x%02x, ", (encoding>>i)&0xff)
577-
}
578-
if *comments {
579-
fmt.Fprintf(w, "// n0x%04x c0x%04x (%s)%s %s %s %s\n",
580-
c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard),
581-
nodeTypeStr(c.nodeType), icannStr(c.icann), c.label,
582-
)
583-
} else {
584-
fmt.Fprintf(w, "\n")
585-
}
586-
}
587-
return nil
588-
}
589-
590584
func printNodeLabel(w io.Writer, n *node) error {
591585
for _, c := range n.children {
592586
fmt.Fprintf(w, "%q,\n", c.label)

publicsuffix/list.go

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,10 @@ loop:
101101
break
102102
}
103103

104-
u := uint32(nodeValue(f) >> (nodesBitsTextOffset + nodesBitsTextLength))
104+
u := uint32(nodes.get(f) >> (nodesBitsTextOffset + nodesBitsTextLength))
105105
icannNode = u&(1<<nodesBitsICANN-1) != 0
106106
u >>= nodesBitsICANN
107-
u = children[u&(1<<nodesBitsChildren-1)]
107+
u = children.get(u & (1<<nodesBitsChildren - 1))
108108
lo = u & (1<<childrenBitsLo - 1)
109109
u >>= childrenBitsLo
110110
hi = u & (1<<childrenBitsHi - 1)
@@ -154,18 +154,9 @@ func find(label string, lo, hi uint32) uint32 {
154154
return notFound
155155
}
156156

157-
func nodeValue(i uint32) uint64 {
158-
off := uint64(i * (nodesBits / 8))
159-
return uint64(nodes[off])<<32 |
160-
uint64(nodes[off+1])<<24 |
161-
uint64(nodes[off+2])<<16 |
162-
uint64(nodes[off+3])<<8 |
163-
uint64(nodes[off+4])
164-
}
165-
166157
// nodeLabel returns the label for the i'th node.
167158
func nodeLabel(i uint32) string {
168-
x := nodeValue(i)
159+
x := nodes.get(i)
169160
length := x & (1<<nodesBitsTextLength - 1)
170161
x >>= nodesBitsTextLength
171162
offset := x & (1<<nodesBitsTextOffset - 1)
@@ -189,3 +180,24 @@ func EffectiveTLDPlusOne(domain string) (string, error) {
189180
}
190181
return domain[1+strings.LastIndex(domain[:i], "."):], nil
191182
}
183+
184+
// uint32String is a string holding a packed table of big-endian 32-bit
// unsigned integers, four bytes per element.
type uint32String string

// get returns the i'th element of u.
//
// The byte offset is computed in 64 bits so that a large i cannot wrap around
// in 32-bit arithmetic and silently alias an earlier element. As with
// ordinary indexing, get panics if element i lies outside the string.
func (u uint32String) get(i uint32) uint32 {
	off := uint64(i) * 4
	return uint32(u[off])<<24 |
		uint32(u[off+1])<<16 |
		uint32(u[off+2])<<8 |
		uint32(u[off+3])
}
193+
194+
// uint40String is a string holding a packed table of big-endian 40-bit
// unsigned integers, five bytes per element.
type uint40String string

// get returns the i'th element of u as a uint64.
//
// The index is widened to 64 bits before it is scaled to a byte offset, so a
// large i cannot wrap around in 32-bit arithmetic and silently read from the
// wrong position. As with ordinary indexing, get panics if element i lies
// outside the string.
func (u uint40String) get(i uint32) uint64 {
	// The element width is fixed by the type itself (40 bits), matching the
	// five byte reads below.
	const elemSize = 40 / 8
	off := uint64(i) * elemSize
	return uint64(u[off])<<32 |
		uint64(u[off+1])<<24 |
		uint64(u[off+2])<<16 |
		uint64(u[off+3])<<8 |
		uint64(u[off+4])
}

0 commit comments

Comments
 (0)