From 845a0108db85ba4bb29f3bd884826cd27f9307ee Mon Sep 17 00:00:00 2001 From: Patricio Whittingslow Date: Fri, 28 Nov 2025 18:37:50 -0300 Subject: [PATCH 1/5] go/token: replace map with array for looking up keywords array access has considerably less overhead than map access, thus yielding benefits in performance and package initialization. --- src/go/token/token.go | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/src/go/token/token.go b/src/go/token/token.go index aa5d6e02a6f287..6ab4ee655d914a 100644 --- a/src/go/token/token.go +++ b/src/go/token/token.go @@ -279,19 +279,35 @@ func (op Token) Precedence() int { return LowestPrec } -var keywords map[string]Token +var keywords [256]Token func init() { - keywords = make(map[string]Token, keyword_end-(keyword_beg+1)) for i := keyword_beg + 1; i < keyword_end; i++ { - keywords[tokens[i]] = i + keywords[keywordsIndex(i.String())] = i } } +// keywordsIndex maps an identifier to an index in keywords array. +func keywordsIndex(maybeKeyword string) uint8 { + if len(maybeKeyword) <= 3 { + if len(maybeKeyword) == 0 { + return 0 + } + return maybeKeyword[0] + } + v0 := maybeKeyword[0] + v1 := maybeKeyword[1] + v2 := maybeKeyword[2] + v3 := maybeKeyword[3] + h := v0 + v1*8 + v2 - v3 + return h +} + // Lookup maps an identifier to its keyword token or [IDENT] (if not a keyword). func Lookup(ident string) Token { - if tok, is_keyword := keywords[ident]; is_keyword { - return tok + maybeMatch := keywords[keywordsIndex(ident)] + if maybeMatch != 0 && maybeMatch.String() == ident { + return maybeMatch } return IDENT } @@ -319,10 +335,9 @@ func IsExported(name string) bool { } // IsKeyword reports whether name is a Go keyword, such as "func" or "return". -func IsKeyword(name string) bool { - // TODO: opt: use a perfect hash function instead of a global map. 
- _, ok := keywords[name] - return ok +func IsKeyword(ident string) bool { + tok := keywords[keywordsIndex(ident)] + return tok != 0 && tok.String() == ident } // IsIdentifier reports whether name is a Go identifier, that is, a non-empty From 087361512fa461419406e44ab82bd94f26b1817f Mon Sep 17 00:00:00 2001 From: Patricio Whittingslow Date: Sun, 30 Nov 2025 18:34:38 -0300 Subject: [PATCH 2/5] add documentation on how hashing works --- src/go/token/token.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/go/token/token.go b/src/go/token/token.go index 6ab4ee655d914a..7a92d623da70ac 100644 --- a/src/go/token/token.go +++ b/src/go/token/token.go @@ -290,11 +290,25 @@ func init() { // keywordsIndex maps an identifier to an index in keywords array. func keywordsIndex(maybeKeyword string) uint8 { if len(maybeKeyword) <= 3 { + // If adding a 2 or 3 letter keyword that starts with `i`(if),`f`(for) or `g`(go) + // you'd need to add logic to this if statement to differentiate between them. if len(maybeKeyword) == 0 { return 0 } return maybeKeyword[0] } + // This hash was adjusted by hand. Finding the working combinations + // for this hash is quite straightforward, even when restricting all + // operations to power-of-two multiplications and addition/subtractions + // for performance reasons since multiplication of an integer by a power-of-two + // can be optimized to a bitshift which is faster on some architectures. 
+ // + // Here is a list of hashes that also works for current keyword set: + // h = v0 + v1*2 + v2*4 + v3*8 + // h = v0 + v1*4 + v2*8 + v3 + // h = v0 + v1*2 + (v2+v3)*2 + // h = v0*4 + v1*2 + v2*2 + v3*2 + // h = v0*4 + v1*2 + v2*v3 v0 := maybeKeyword[0] v1 := maybeKeyword[1] v2 := maybeKeyword[2] From dbbeccc0ed76e2f0eda8c1272ce8e1fdafe68f48 Mon Sep 17 00:00:00 2001 From: Patricio Whittingslow Date: Wed, 3 Dec 2025 12:32:44 -0300 Subject: [PATCH 3/5] add benchmark to compare keyword proving --- .../compile/internal/syntax/parser_test.go | 58 ++++++++++++++++++- src/cmd/compile/internal/syntax/scanner.go | 41 ++++++++++++- 2 files changed, 95 insertions(+), 4 deletions(-) diff --git a/src/cmd/compile/internal/syntax/parser_test.go b/src/cmd/compile/internal/syntax/parser_test.go index b6c4b8fd5693d1..47c8228d13a050 100644 --- a/src/cmd/compile/internal/syntax/parser_test.go +++ b/src/cmd/compile/internal/syntax/parser_test.go @@ -38,6 +38,62 @@ func TestVerify(t *testing.T) { verifyPrint(t, *src_, ast) } +func BenchmarkParseStdLib(b *testing.B) { + if testing.Short() { + b.Skip("skipping test in short mode") + } + var skipRx *regexp.Regexp + if *skip != "" { + var err error + skipRx, err = regexp.Compile(*skip) + if err != nil { + b.Fatalf("invalid argument for -skip (%v)", err) + } + } + // We read in all files to ignore + type file struct { + name string + base *PosBase + data []byte + } + var largestfile *file + var files []file + goroot := testenv.GOROOT(b) + dirs := []string{ + filepath.Join(goroot, "src"), + filepath.Join(goroot, "misc"), + } + for _, dir := range dirs { + walkDirs(b, dir, func(filename string) { + if skipRx != nil && skipRx.MatchString(filename) { + // Always report skipped files since regexp + // typos can lead to surprising results. 
+ fmt.Printf("skipping %s\n", filename) + return + } + data, err := os.ReadFile(filename) + if err != nil { + b.Fatal(err) + } + files = append(files, file{ + name: filename, + data: data, + base: NewFileBase(filename), + }) + f := &files[len(files)-1] + if largestfile == nil || len(f.data) > len(largestfile.data) { + largestfile = f + } + }) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + var buf bytes.Reader + buf.Reset(largestfile.data) + Parse(largestfile.base, &buf, nil, nil, 0) + } +} + func TestStdLib(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode") @@ -123,7 +179,7 @@ func TestStdLib(t *testing.T) { fmt.Printf("allocated %.3fMb (%.3fMb/s)\n", dm, dm/dt.Seconds()) } -func walkDirs(t *testing.T, dir string, action func(string)) { +func walkDirs(t testing.TB, dir string, action func(string)) { entries, err := os.ReadDir(dir) if err != nil { t.Error(err) diff --git a/src/cmd/compile/internal/syntax/scanner.go b/src/cmd/compile/internal/syntax/scanner.go index 700908f6bda28a..860f5211710a6c 100644 --- a/src/cmd/compile/internal/syntax/scanner.go +++ b/src/cmd/compile/internal/syntax/scanner.go @@ -381,7 +381,10 @@ func (s *scanner) ident() { // possibly a keyword lit := s.segment() if len(lit) >= 2 { - if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) { + // tok := keywordMap[hash(lit)] + // tok := keywords[keywordsIndex(lit)] + tok := keywordRuntimeMap[string(lit)] + if tok != 0 && tokStrFast(tok) == string(lit) { s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok) s.tok = tok return @@ -422,15 +425,47 @@ func hash(s []byte) uint { } var keywordMap [1 << 6]token // size must be power of two +var keywordRuntimeMap = make(map[string]token) +var keywords [256]Token + +// keywordsIndex maps an identifier to an index in keywords array. +func keywordsIndex(maybeKeyword []byte) uint8 { + if len(maybeKeyword) <= 3 { + return maybeKeyword[0] + } + // This hash was adjusted by hand. 
Finding the working combinations + // for this hash is quite straightforward, even when restricting all + // operations to power-of-two multiplications and addition/subtractions + // for performance reasons since multiplication of an integer by a power-of-two + // can be optimized to a bitshift which is faster on some architectures. + // + // Here is a list of hashes that also works for current keyword set: + // h = v0 + v1*2 + v2*4 + v3*8 + // h = v0 + v1*4 + v2*8 + v3 + // h = v0 + v1*2 + (v2+v3)*2 + // h = v0*4 + v1*2 + v2*2 + v3*2 + // h = v0*4 + v1*2 + v2*v3 + v0 := maybeKeyword[0] + v1 := maybeKeyword[1] + v2 := maybeKeyword[2] + v3 := maybeKeyword[3] + h := v0 + v1*8 + v2 - v3 + return h +} func init() { // populate keywordMap for tok := _Break; tok <= _Var; tok++ { - h := hash([]byte(tok.String())) - if keywordMap[h] != 0 { + kws := tok.String() + kw := []byte(kws) + i := keywordsIndex(kw) + h := hash(kw) + if keywordMap[h] != 0 || keywords[i] != 0 { panic("imperfect hash") } + keywords[i] = tok keywordMap[h] = tok + keywordRuntimeMap[kws] = tok } } From 64baebd035b55c272cdd99e5ed069dd51f0b94d6 Mon Sep 17 00:00:00 2001 From: Patricio Whittingslow Date: Wed, 3 Dec 2025 14:33:02 -0300 Subject: [PATCH 4/5] use syntax package hash --- .../compile/internal/syntax/parser_test.go | 35 +++++++++--- src/cmd/compile/internal/syntax/scanner.go | 41 +------------- src/go/token/token.go | 55 +++++++------------ 3 files changed, 50 insertions(+), 81 deletions(-) diff --git a/src/cmd/compile/internal/syntax/parser_test.go b/src/cmd/compile/internal/syntax/parser_test.go index 47c8228d13a050..cdf87a4fe9ecce 100644 --- a/src/cmd/compile/internal/syntax/parser_test.go +++ b/src/cmd/compile/internal/syntax/parser_test.go @@ -13,6 +13,7 @@ import ( "path/filepath" "regexp" "runtime" + "slices" "strings" "sync" "testing" @@ -56,7 +57,6 @@ func BenchmarkParseStdLib(b *testing.B) { base *PosBase data []byte } - var largestfile *file var files []file goroot := testenv.GOROOT(b) 
dirs := []string{ @@ -80,18 +80,35 @@ func BenchmarkParseStdLib(b *testing.B) { data: data, base: NewFileBase(filename), }) - f := &files[len(files)-1] - if largestfile == nil || len(f.data) > len(largestfile.data) { - largestfile = f - } }) } + slices.SortStableFunc(files, func(a, b file) int { + return len(a.data) - len(b.data) + }) b.ResetTimer() - for i := 0; i < b.N; i++ { - var buf bytes.Reader - buf.Reset(largestfile.data) - Parse(largestfile.base, &buf, nil, nil, 0) + const numberOfFiles = 10 + if len(files) < numberOfFiles*2 { + b.Error("too few files matched to run") } + b.Run(fmt.Sprintf("longest %d files", numberOfFiles), func(b *testing.B) { + var buf bytes.Reader + for i := 0; i < b.N; i++ { + for _, file := range files[len(files)-numberOfFiles:] { + buf.Reset(file.data) + Parse(file.base, &buf, nil, nil, 0) + } + } + }) + + b.Run(fmt.Sprintf("shortest %d files", numberOfFiles), func(b *testing.B) { + var buf bytes.Reader + for i := 0; i < b.N; i++ { + for _, file := range files[:numberOfFiles] { + buf.Reset(file.data) + Parse(file.base, &buf, nil, nil, 0) + } + } + }) } func TestStdLib(t *testing.T) { diff --git a/src/cmd/compile/internal/syntax/scanner.go b/src/cmd/compile/internal/syntax/scanner.go index 860f5211710a6c..700908f6bda28a 100644 --- a/src/cmd/compile/internal/syntax/scanner.go +++ b/src/cmd/compile/internal/syntax/scanner.go @@ -381,10 +381,7 @@ func (s *scanner) ident() { // possibly a keyword lit := s.segment() if len(lit) >= 2 { - // tok := keywordMap[hash(lit)] - // tok := keywords[keywordsIndex(lit)] - tok := keywordRuntimeMap[string(lit)] - if tok != 0 && tokStrFast(tok) == string(lit) { + if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) { s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok) s.tok = tok return @@ -425,47 +422,15 @@ func hash(s []byte) uint { } var keywordMap [1 << 6]token // size must be power of two -var keywordRuntimeMap = make(map[string]token) -var keywords 
[256]Token - -// keywordsIndex maps an identifier to an index in keywords array. -func keywordsIndex(maybeKeyword []byte) uint8 { - if len(maybeKeyword) <= 3 { - return maybeKeyword[0] - } - // This hash was adjusted by hand. Finding the working combinations - // for this hash is quite straightforward, even when restricting all - // operations to power-of-two multiplications and addition/subtractions - // for performance reasons since multiplication of an integer by a power-of-two - // can be optimized to a bitshift which is faster on some architectures. - // - // Here is a list of hashes that also works for current keyword set: - // h = v0 + v1*2 + v2*4 + v3*8 - // h = v0 + v1*4 + v2*8 + v3 - // h = v0 + v1*2 + (v2+v3)*2 - // h = v0*4 + v1*2 + v2*2 + v3*2 - // h = v0*4 + v1*2 + v2*v3 - v0 := maybeKeyword[0] - v1 := maybeKeyword[1] - v2 := maybeKeyword[2] - v3 := maybeKeyword[3] - h := v0 + v1*8 + v2 - v3 - return h -} func init() { // populate keywordMap for tok := _Break; tok <= _Var; tok++ { - kws := tok.String() - kw := []byte(kws) - i := keywordsIndex(kw) - h := hash(kw) - if keywordMap[h] != 0 || keywords[i] != 0 { + h := hash([]byte(tok.String())) + if keywordMap[h] != 0 { panic("imperfect hash") } - keywords[i] = tok keywordMap[h] = tok - keywordRuntimeMap[kws] = tok } } diff --git a/src/go/token/token.go b/src/go/token/token.go index 7a92d623da70ac..cba0222454e406 100644 --- a/src/go/token/token.go +++ b/src/go/token/token.go @@ -279,47 +279,31 @@ func (op Token) Precedence() int { return LowestPrec } -var keywords [256]Token - -func init() { - for i := keyword_beg + 1; i < keyword_end; i++ { - keywords[keywordsIndex(i.String())] = i - } +// hash is a perfect hash function for keywords. +// It assumes that s has at least length 2. +func hash(s string) uint { + return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1) } -// keywordsIndex maps an identifier to an index in keywords array. 
-func keywordsIndex(maybeKeyword string) uint8 { - if len(maybeKeyword) <= 3 { - // If adding a 2 or 3 letter keyword that starts with `i`(if),`f`(for) or `g`(go) - // you'd need to add logic to this if statement to differentiate between them. - if len(maybeKeyword) == 0 { - return 0 +var keywordMap [1 << 6]Token // size must be power of two + +func init() { + // populate keywordMap + for tok := keyword_beg + 1; tok < keyword_end; tok++ { + h := hash(tok.String()) + if keywordMap[h] != 0 { + panic("imperfect hash") } - return maybeKeyword[0] + keywordMap[h] = tok } - // This hash was adjusted by hand. Finding the working combinations - // for this hash is quite straightforward, even when restricting all - // operations to power-of-two multiplications and addition/subtractions - // for performance reasons since multiplication of an integer by a power-of-two - // can be optimized to a bitshift which is faster on some architectures. - // - // Here is a list of hashes that also works for current keyword set: - // h = v0 + v1*2 + v2*4 + v3*8 - // h = v0 + v1*4 + v2*8 + v3 - // h = v0 + v1*2 + (v2+v3)*2 - // h = v0*4 + v1*2 + v2*2 + v3*2 - // h = v0*4 + v1*2 + v2*v3 - v0 := maybeKeyword[0] - v1 := maybeKeyword[1] - v2 := maybeKeyword[2] - v3 := maybeKeyword[3] - h := v0 + v1*8 + v2 - v3 - return h } // Lookup maps an identifier to its keyword token or [IDENT] (if not a keyword). func Lookup(ident string) Token { - maybeMatch := keywords[keywordsIndex(ident)] + if len(ident) < 2 { + return IDENT + } + maybeMatch := keywordMap[hash(ident)] if maybeMatch != 0 && maybeMatch.String() == ident { return maybeMatch } @@ -350,7 +334,10 @@ func IsExported(name string) bool { // IsKeyword reports whether name is a Go keyword, such as "func" or "return". 
func IsKeyword(ident string) bool { - tok := keywords[keywordsIndex(ident)] + if len(ident) < 2 { + return false + } + tok := keywordMap[hash(ident)] return tok != 0 && tok.String() == ident } From a9a01b075048f09e5b5fa08ab011be2e329d3b69 Mon Sep 17 00:00:00 2001 From: Patricio Whittingslow Date: Fri, 5 Dec 2025 09:48:17 -0300 Subject: [PATCH 5/5] improve stdlib parsing benchmark by not loading all stdlib into memory --- .../compile/internal/syntax/parser_test.go | 30 ++++++++++++++----- src/go/token/token.go | 5 ++++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/cmd/compile/internal/syntax/parser_test.go b/src/cmd/compile/internal/syntax/parser_test.go index cdf87a4fe9ecce..404317b3c03019 100644 --- a/src/cmd/compile/internal/syntax/parser_test.go +++ b/src/cmd/compile/internal/syntax/parser_test.go @@ -39,6 +39,8 @@ func TestVerify(t *testing.T) { verifyPrint(t, *src_, ast) } +// To run only this benchmark and obtain results for benchstat: +// go test -bench=ParseStdLib -benchtime=5s -run none -count=20 func BenchmarkParseStdLib(b *testing.B) { if testing.Short() { b.Skip("skipping test in short mode") @@ -55,7 +57,8 @@ func BenchmarkParseStdLib(b *testing.B) { type file struct { name string base *PosBase - data []byte + data []byte // data populated only for files being tested. 
+ size int64 } var files []file goroot := testenv.GOROOT(b) @@ -71,25 +74,37 @@ func BenchmarkParseStdLib(b *testing.B) { fmt.Printf("skipping %s\n", filename) return } - data, err := os.ReadFile(filename) + info, err := os.Stat(filename) if err != nil { b.Fatal(err) } files = append(files, file{ name: filename, - data: data, + size: info.Size(), base: NewFileBase(filename), }) }) } - slices.SortStableFunc(files, func(a, b file) int { - return len(a.data) - len(b.data) - }) - b.ResetTimer() const numberOfFiles = 10 if len(files) < numberOfFiles*2 { b.Error("too few files matched to run") } + loadFile := func(f *file) { + var err error + f.data, err = os.ReadFile(f.name) + if err != nil { + b.Fatal(err) + } + } + slices.SortStableFunc(files, func(a, b file) int { + return int(a.size - b.size) + }) + // We load the files we'll be testing into memory to avoid noise introduced by the operating system. + for i := 0; i < numberOfFiles; i++ { + loadFile(&files[i]) // Load smallest files. + loadFile(&files[len(files)-i-1]) // Load largest files. + } + b.ResetTimer() b.Run(fmt.Sprintf("longest %d files", numberOfFiles), func(b *testing.B) { var buf bytes.Reader for i := 0; i < b.N; i++ { @@ -99,7 +114,6 @@ func BenchmarkParseStdLib(b *testing.B) { } } }) - b.Run(fmt.Sprintf("shortest %d files", numberOfFiles), func(b *testing.B) { var buf bytes.Reader for i := 0; i < b.N; i++ { diff --git a/src/go/token/token.go b/src/go/token/token.go index cba0222454e406..6835fdd2f4ac13 100644 --- a/src/go/token/token.go +++ b/src/go/token/token.go @@ -282,9 +282,14 @@ func (op Token) Precedence() int { // hash is a perfect hash function for keywords. // It assumes that s has at least length 2. func hash(s string) uint { + // If adding a keyword makes init panic with a collision, two keywords + // hash to the same slot: the hash only mixes the first two bytes + // and the length of the identifier, so more bytes may need processing. + // Best course of action is increasing the keyword map size or tuning the hash operations. 
 return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1) } +// keywordMap is a perfect hash table of keywords, modeled on the one in src/cmd/compile/internal/syntax/scanner.go. var keywordMap [1 << 6]Token // size must be power of two func init() {