@@ -22,6 +22,7 @@ package main
22
22
import (
23
23
"bufio"
24
24
"bytes"
25
+ "encoding/binary"
25
26
"flag"
26
27
"fmt"
27
28
"go/format"
@@ -55,6 +56,7 @@ const (
55
56
)
56
57
57
58
var (
59
+ combinedText string
58
60
maxChildren int
59
61
maxTextOffset int
60
62
maxTextLength int
@@ -115,11 +117,10 @@ var (
115
117
shaRE = regexp .MustCompile (`"sha":"([^"]+)"` )
116
118
dateRE = regexp .MustCompile (`"committer":{[^{]+"date":"([^"]+)"` )
117
119
118
- comments = flag .Bool ("comments" , false , "generate table.go comments, for debugging" )
119
- subset = flag .Bool ("subset" , false , "generate only a subset of the full table, for debugging" )
120
- url = flag .String ("url" , defaultURL , "URL of the publicsuffix.org list. If empty, stdin is read instead" )
121
- v = flag .Bool ("v" , false , "verbose output (to stderr)" )
122
- version = flag .String ("version" , "" , "the effective_tld_names.dat version" )
120
+ subset = flag .Bool ("subset" , false , "generate only a subset of the full table, for debugging" )
121
+ url = flag .String ("url" , defaultURL , "URL of the publicsuffix.org list. If empty, stdin is read instead" )
122
+ v = flag .Bool ("v" , false , "verbose output (to stderr)" )
123
+ version = flag .String ("version" , "" , "the effective_tld_names.dat version" )
123
124
)
124
125
125
126
func main () {
@@ -254,7 +255,33 @@ func main1() error {
254
255
}
255
256
sort .Strings (labelsList )
256
257
257
- if err := generate (printReal , & root , "table.go" ); err != nil {
258
+ combinedText = combineText (labelsList )
259
+ if combinedText == "" {
260
+ return fmt .Errorf ("internal error: combineText returned no text" )
261
+ }
262
+ for _ , label := range labelsList {
263
+ offset , length := strings .Index (combinedText , label ), len (label )
264
+ if offset < 0 {
265
+ return fmt .Errorf ("internal error: could not find %q in text %q" , label , combinedText )
266
+ }
267
+ maxTextOffset , maxTextLength = max (maxTextOffset , offset ), max (maxTextLength , length )
268
+ if offset >= 1 << nodesBitsTextOffset {
269
+ return fmt .Errorf ("text offset %d is too large, or nodeBitsTextOffset is too small" , offset )
270
+ }
271
+ if length >= 1 << nodesBitsTextLength {
272
+ return fmt .Errorf ("text length %d is too large, or nodeBitsTextLength is too small" , length )
273
+ }
274
+ labelEncoding [label ] = uint64 (offset )<< nodesBitsTextLength | uint64 (length )
275
+ }
276
+
277
+ if err := root .walk (assignIndexes ); err != nil {
278
+ return err
279
+ }
280
+
281
+ if err := generate (printMetadata , & root , "table.go" ); err != nil {
282
+ return err
283
+ }
284
+ if err := generateBinaryData (& root , combinedText ); err != nil {
258
285
return err
259
286
}
260
287
if err := generate (printTest , & root , "table_test.go" ); err != nil {
@@ -307,18 +334,63 @@ func printTest(w io.Writer, n *node) error {
307
334
fmt .Fprintf (w , "%q,\n " , rule )
308
335
}
309
336
fmt .Fprintf (w , "}\n \n var nodeLabels = [...]string{\n " )
310
- if err := n .walk (w , printNodeLabel ); err != nil {
337
+ if err := n .walk (func (n * node ) error {
338
+ return printNodeLabel (w , n )
339
+ }); err != nil {
311
340
return err
312
341
}
313
342
fmt .Fprintf (w , "}\n " )
314
343
return nil
315
344
}
316
345
317
- func printReal (w io.Writer , n * node ) error {
346
+ func generateBinaryData (root * node , combinedText string ) error {
347
+ if err := os .WriteFile ("data/text" , []byte (combinedText ), 0666 ); err != nil {
348
+ return err
349
+ }
350
+
351
+ var nodes []byte
352
+ if err := root .walk (func (n * node ) error {
353
+ for _ , c := range n .children {
354
+ nodes = appendNodeEncoding (nodes , c )
355
+ }
356
+ return nil
357
+ }); err != nil {
358
+ return err
359
+ }
360
+ if err := os .WriteFile ("data/nodes" , nodes , 0666 ); err != nil {
361
+ return err
362
+ }
363
+
364
+ var children []byte
365
+ for _ , c := range childrenEncoding {
366
+ children = binary .BigEndian .AppendUint32 (children , c )
367
+ }
368
+ if err := os .WriteFile ("data/children" , children , 0666 ); err != nil {
369
+ return err
370
+ }
371
+
372
+ return nil
373
+ }
374
+
375
+ func appendNodeEncoding (b []byte , n * node ) []byte {
376
+ encoding := labelEncoding [n .label ]
377
+ if n .icann {
378
+ encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset )
379
+ }
380
+ encoding |= uint64 (n .childrenIndex ) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN )
381
+ for i := nodesBits - 8 ; i >= 0 ; i -= 8 {
382
+ b = append (b , byte ((encoding >> i )& 0xff ))
383
+ }
384
+ return b
385
+ }
386
+
387
+ func printMetadata (w io.Writer , n * node ) error {
318
388
const header = `// generated by go run gen.go; DO NOT EDIT
319
389
320
390
package publicsuffix
321
391
392
+ import _ "embed"
393
+
322
394
const version = %q
323
395
324
396
const (
@@ -343,74 +415,36 @@ const (
343
415
// numTLD is the number of top level domains.
344
416
const numTLD = %d
345
417
418
+ // text is the combined text of all labels.
419
+ //
420
+ //go:embed data/text
421
+ var text string
422
+
346
423
`
347
424
fmt .Fprintf (w , header , * version ,
348
425
nodesBits ,
349
426
nodesBitsChildren , nodesBitsICANN , nodesBitsTextOffset , nodesBitsTextLength ,
350
427
childrenBitsWildcard , childrenBitsNodeType , childrenBitsHi , childrenBitsLo ,
351
428
nodeTypeNormal , nodeTypeException , nodeTypeParentOnly , len (n .children ))
352
-
353
- text := combineText (labelsList )
354
- if text == "" {
355
- return fmt .Errorf ("internal error: makeText returned no text" )
356
- }
357
- for _ , label := range labelsList {
358
- offset , length := strings .Index (text , label ), len (label )
359
- if offset < 0 {
360
- return fmt .Errorf ("internal error: could not find %q in text %q" , label , text )
361
- }
362
- maxTextOffset , maxTextLength = max (maxTextOffset , offset ), max (maxTextLength , length )
363
- if offset >= 1 << nodesBitsTextOffset {
364
- return fmt .Errorf ("text offset %d is too large, or nodeBitsTextOffset is too small" , offset )
365
- }
366
- if length >= 1 << nodesBitsTextLength {
367
- return fmt .Errorf ("text length %d is too large, or nodeBitsTextLength is too small" , length )
368
- }
369
- labelEncoding [label ] = uint64 (offset )<< nodesBitsTextLength | uint64 (length )
370
- }
371
- fmt .Fprintf (w , "// Text is the combined text of all labels.\n const text = " )
372
- for len (text ) > 0 {
373
- n , plus := len (text ), ""
374
- if n > 64 {
375
- n , plus = 64 , " +"
376
- }
377
- fmt .Fprintf (w , "%q%s\n " , text [:n ], plus )
378
- text = text [n :]
379
- }
380
-
381
- if err := n .walk (w , assignIndexes ); err != nil {
382
- return err
383
- }
384
-
385
429
fmt .Fprintf (w , `
386
-
387
430
// nodes is the list of nodes. Each node is represented as a %v-bit integer,
388
431
// which encodes the node's children, wildcard bit and node type (as an index
389
432
// into the children array), ICANN bit and text.
390
433
//
391
- // If the table was generated with the -comments flag, there is a //-comment
392
- // after each node's data. In it is the nodes-array indexes of the children,
393
- // formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
394
- // nodeType is printed as + for normal, ! for exception, and o for parent-only
395
- // nodes that have children but don't match a domain label in their own right.
396
- // An I denotes an ICANN domain.
397
- //
398
434
// The layout within the node, from MSB to LSB, is:
399
435
// [%2d bits] unused
400
436
// [%2d bits] children index
401
437
// [%2d bits] ICANN bit
402
438
// [%2d bits] text index
403
439
// [%2d bits] text length
404
- var nodes = [...]uint8{
440
+ //
441
+ //go:embed data/nodes
442
+ var nodes uint40String
405
443
` ,
406
444
nodesBits ,
407
445
nodesBits - nodesBitsChildren - nodesBitsICANN - nodesBitsTextOffset - nodesBitsTextLength ,
408
446
nodesBitsChildren , nodesBitsICANN , nodesBitsTextOffset , nodesBitsTextLength )
409
- if err := n .walk (w , printNode ); err != nil {
410
- return err
411
- }
412
- fmt .Fprintf (w , `}
413
-
447
+ fmt .Fprintf (w , `
414
448
// children is the list of nodes' children, the parent's wildcard bit and the
415
449
// parent's node type. If a node has no children then their children index
416
450
// will be in the range [0, 6), depending on the wildcard bit and node type.
@@ -421,27 +455,13 @@ var nodes = [...]uint8{
421
455
// [%2d bits] node type
422
456
// [%2d bits] high nodes index (exclusive) of children
423
457
// [%2d bits] low nodes index (inclusive) of children
424
- var children=[...]uint32{
458
+ //
459
+ //go:embed data/children
460
+ var children uint32String
425
461
` ,
426
462
32 - childrenBitsWildcard - childrenBitsNodeType - childrenBitsHi - childrenBitsLo ,
427
463
childrenBitsWildcard , childrenBitsNodeType , childrenBitsHi , childrenBitsLo )
428
- for i , c := range childrenEncoding {
429
- s := "---------------"
430
- lo := c & (1 << childrenBitsLo - 1 )
431
- hi := (c >> childrenBitsLo ) & (1 << childrenBitsHi - 1 )
432
- if lo != hi {
433
- s = fmt .Sprintf ("n0x%04x-n0x%04x" , lo , hi )
434
- }
435
- nodeType := int (c >> (childrenBitsLo + childrenBitsHi )) & (1 << childrenBitsNodeType - 1 )
436
- wildcard := c >> (childrenBitsLo + childrenBitsHi + childrenBitsNodeType ) != 0
437
- if * comments {
438
- fmt .Fprintf (w , "0x%08x, // c0x%04x (%s)%s %s\n " ,
439
- c , i , s , wildcardStr (wildcard ), nodeTypeStr (nodeType ))
440
- } else {
441
- fmt .Fprintf (w , "0x%x,\n " , c )
442
- }
443
- }
444
- fmt .Fprintf (w , "}\n \n " )
464
+
445
465
fmt .Fprintf (w , "// max children %d (capacity %d)\n " , maxChildren , 1 << nodesBitsChildren - 1 )
446
466
fmt .Fprintf (w , "// max text offset %d (capacity %d)\n " , maxTextOffset , 1 << nodesBitsTextOffset - 1 )
447
467
fmt .Fprintf (w , "// max text length %d (capacity %d)\n " , maxTextLength , 1 << nodesBitsTextLength - 1 )
@@ -465,12 +485,12 @@ type node struct {
465
485
children []* node
466
486
}
467
487
468
- func (n * node ) walk (w io. Writer , f func (w1 io. Writer , n1 * node ) error ) error {
469
- if err := f (w , n ); err != nil {
488
+ func (n * node ) walk (f func (* node ) error ) error {
489
+ if err := f (n ); err != nil {
470
490
return err
471
491
}
472
492
for _ , c := range n .children {
473
- if err := c .walk (w , f ); err != nil {
493
+ if err := c .walk (f ); err != nil {
474
494
return err
475
495
}
476
496
}
@@ -516,7 +536,7 @@ var childrenEncoding = []uint32{
516
536
517
537
var firstCallToAssignIndexes = true
518
538
519
- func assignIndexes (w io. Writer , n * node ) error {
539
+ func assignIndexes (n * node ) error {
520
540
if len (n .children ) != 0 {
521
541
// Assign nodesIndex.
522
542
n .firstChild = nextNodesIndex
@@ -561,32 +581,6 @@ func assignIndexes(w io.Writer, n *node) error {
561
581
return nil
562
582
}
563
583
564
- func printNode (w io.Writer , n * node ) error {
565
- for _ , c := range n .children {
566
- s := "---------------"
567
- if len (c .children ) != 0 {
568
- s = fmt .Sprintf ("n0x%04x-n0x%04x" , c .firstChild , c .firstChild + len (c .children ))
569
- }
570
- encoding := labelEncoding [c .label ]
571
- if c .icann {
572
- encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset )
573
- }
574
- encoding |= uint64 (c .childrenIndex ) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN )
575
- for i := nodesBits - 8 ; i >= 0 ; i -= 8 {
576
- fmt .Fprintf (w , "0x%02x, " , (encoding >> i )& 0xff )
577
- }
578
- if * comments {
579
- fmt .Fprintf (w , "// n0x%04x c0x%04x (%s)%s %s %s %s\n " ,
580
- c .nodesIndex , c .childrenIndex , s , wildcardStr (c .wildcard ),
581
- nodeTypeStr (c .nodeType ), icannStr (c .icann ), c .label ,
582
- )
583
- } else {
584
- fmt .Fprintf (w , "\n " )
585
- }
586
- }
587
- return nil
588
- }
589
-
590
584
func printNodeLabel (w io.Writer , n * node ) error {
591
585
for _ , c := range n .children {
592
586
fmt .Fprintf (w , "%q,\n " , c .label )
0 commit comments