Skip to content

Commit

Permalink
feat: Speed up symbol parsing by minimizing allocations (#258)
Browse files Browse the repository at this point in the history
Changes the symbol parsing logic to minimize allocations. In particular,
when we only care about validating symbols (e.g. during document
canonicalization when ingesting uploads), there is really ~no need to
allocate any strings at all. Validation and parsing share most of the
underlying code -- the only change is we create "writer" types which
will discard writes (and hence any internal buffer growth) when we're
only in validation mode.

For parsing mode, it is now possible to pass in pre-allocated values
which will be overwritten as possible (unless the symbol is very large,
in which case it still needs to allocate new Descriptors).

See PR for benchmarks.
  • Loading branch information
varungandhi-src authored Dec 4, 2024
1 parent cf2bf08 commit 12ff730
Show file tree
Hide file tree
Showing 26 changed files with 3,645 additions and 2,410 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/golang.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ jobs:
- uses: ./.github/actions/asdf
with:
golang: true
- run: go test ./... -v
- run: go test ./... -v -tags asserts
1 change: 1 addition & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ reprolang/target
reprolang/src/grammar.json
reprolang/src/node-types.json
bindings/typescript
docs/scip.md
.bin
2 changes: 1 addition & 1 deletion .tool-versions
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
golang 1.20.14
golang 1.22.0
nodejs 16.20.2
shellcheck 0.7.1
yarn 1.22.22
Expand Down
19 changes: 19 additions & 0 deletions Development.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
- [Project structure](#project-structure)
- [Code generation](#code-generation)
- [Debugging](#debugging)
- [Benchmarking](#benchmarking)
- [Testing and adding new SCIP semantics](#testing-and-adding-new-scip-semantics)
- [Release a new version](#release-a-new-version)

Expand Down Expand Up @@ -69,6 +70,24 @@ and is not recommended for use in other settings.
scip lint /path/to/index.scip
```

## Benchmarking

For benchmarks, one can put test SCIP indexes under `dev/sample_indexes`.

Sourcegraph teammates can download several large indexes
from this [Google drive folder](https://drive.google.com/drive/folders/1z62Se7eHaa5T89a16-y7s0Z1qbRY4VCg).

After that you can run:

```bash
go run ./bindings/go/scip/speedtest
```

to see the results.

Make sure to share benchmark results when making changes to
the symbol parsing logic.

## Testing and adding new SCIP semantics

It is helpful to use reprolang to check the existing code navigation behavior,
Expand Down
9 changes: 9 additions & 0 deletions bindings/go/scip/assertions.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
//go:build asserts

package scip

func assert(cond bool, msg string) {
if !cond {
panic(msg)
}
}
6 changes: 6 additions & 0 deletions bindings/go/scip/assertions_noop.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
//go:build !asserts

package scip

// assert is a noop in release builds - the implementation is in assertions.go
func assert(cond bool, msg string) {}
72 changes: 72 additions & 0 deletions bindings/go/scip/internal/compat_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package internal

import (
"io"
"os"
"path/filepath"
"sync/atomic"
"testing"

"github.com/sourcegraph/beaut"
"github.com/sourcegraph/beaut/lib/knownwf"
conciter "github.com/sourcegraph/conc/iter"
"github.com/sourcegraph/scip/bindings/go/scip"
"github.com/sourcegraph/scip/bindings/go/scip/internal/shared"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/proto"
)

func TestParseCompat(t *testing.T) {
for _, path := range shared.SampleIndexes() {
t.Run(filepath.Base(path), func(t *testing.T) {
t.Parallel()
scipReader, err := os.Open(path)
require.Nil(t, err)
scipBytes, err := io.ReadAll(scipReader)
require.Nil(t, err)
scipIndex := scip.Index{}
require.NoError(t, proto.Unmarshal(scipBytes, &scipIndex))
var total atomic.Int64
conciter.ForEach(scipIndex.Documents, func(docPtr **scip.Document) {
document := *docPtr
if total.Load() > 1000*1000 {
return
}
total.Add(int64(len(document.Occurrences)))
var newSym scip.Symbol
for i := 0; i < len(document.Occurrences); i++ {
occ := document.Occurrences[i]
oldSym, oldErr := ParsePartialSymbolV1ToBeDeleted(occ.Symbol, true)
var newErr error
require.NotPanics(t, func() {
str := beaut.NewUTF8StringUnchecked(occ.Symbol, knownwf.UTF8DeserializedFromProtobufString)
newErr = scip.ParseSymbolUTF8With(str, scip.ParseSymbolOptions{
IncludeDescriptors: true,
RecordOutput: &newSym,
})
}, "panic for symbol: %q", occ.Symbol)
if oldErr != nil {
require.Error(t, newErr,
"old parser gave error %v but parse was successful with new parser (symbol: %q)",
oldErr.Error(), occ.Symbol)
continue
} else if newErr != nil {
require.NoError(t, newErr,
"new parser gave error %v but parse was successful with old parser (symbol: %q)",
newErr.Error(), occ.Symbol)
}
require.Equal(t, oldSym.Scheme, newSym.Scheme)
require.Equal(t, oldSym.Package, newSym.Package)
require.Equalf(t, len(oldSym.Descriptors), len(newSym.Descriptors), "symbol: %v, d1: %+v, d2: %+v", occ.Symbol,
oldSym.Descriptors, newSym.Descriptors)
for i, d := range oldSym.Descriptors {
dnew := newSym.Descriptors[i]
require.Equal(t, d.Name, dnew.Name, "symbol: %v", occ.Symbol)
require.Equal(t, d.Suffix, dnew.Suffix, "symbol: %v", occ.Symbol)
require.Equal(t, d.Disambiguator, dnew.Disambiguator, "symbol: %v", occ.Symbol)
}
}
})
})
}
}
238 changes: 238 additions & 0 deletions bindings/go/scip/internal/old_symbol_parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
package internal

import (
"fmt"
"strings"

"github.com/cockroachdb/errors"
"github.com/sourcegraph/scip/bindings/go/scip"
"github.com/sourcegraph/scip/bindings/go/scip/internal/shared"
)

func tryParseLocalSymbol(symbol string) (string, error) {
if !strings.HasPrefix(symbol, "local ") {
return "", nil
}
suffix := symbol[6:]
if len(suffix) > 0 && shared.IsSimpleIdentifier(suffix) {
return suffix, nil
}
return "", errors.Newf("expected format 'local <simple-identifier>' but got: %v", symbol)
}

// ParsePartialSymbolV1ToBeDeleted parses an SCIP string into the Symbol message
// with the option to exclude the `.Descriptor` field.
//
// Nov 30 2024: This is currently only present for benchmarking + compatibility
// reasons. We can remove this in the future once we're confident that the new
// parser handles everything correctly.
func ParsePartialSymbolV1ToBeDeleted(symbol string, includeDescriptors bool) (*scip.Symbol, error) {
local, err := tryParseLocalSymbol(symbol)
if err != nil {
return nil, err
}
if local != "" {
return &scip.Symbol{
Scheme: "local",
Descriptors: []*scip.Descriptor{
{Name: local, Suffix: scip.Descriptor_Local},
},
}, nil
}
s := newSymbolParser(symbol)
scheme, err := s.acceptSpaceEscapedIdentifier("scheme")
if err != nil {
return nil, err
}
manager, err := s.acceptSpaceEscapedIdentifier("package manager")
if err != nil {
return nil, err
}
if manager == "." {
manager = ""
}
packageName, err := s.acceptSpaceEscapedIdentifier("package name")
if err != nil {
return nil, err
}
if packageName == "." {
packageName = ""
}
packageVersion, err := s.acceptSpaceEscapedIdentifier("package version")
if err != nil {
return nil, err
}
if packageVersion == "." {
packageVersion = ""
}
var descriptors []*scip.Descriptor
if includeDescriptors {
descriptors, err = s.parseDescriptors()
}
return &scip.Symbol{
Scheme: scheme,
Package: &scip.Package{
Manager: manager,
Name: packageName,
Version: packageVersion,
},
Descriptors: descriptors,
}, err
}

type symbolParser struct {
Symbol []rune
index int
SymbolString string
}

func newSymbolParser(symbol string) *symbolParser {
return &symbolParser{
SymbolString: symbol,
Symbol: []rune(symbol),
index: 0,
}
}

func (s *symbolParser) error(message string) error {
return errors.Newf("%s\n%s\n%s^", message, s.SymbolString, strings.Repeat("_", s.index))
}

func (s *symbolParser) current() rune {
if s.index < len(s.Symbol) {
return s.Symbol[s.index]
}
return '\x00'
}

func (s *symbolParser) peekNext() rune {
if s.index+1 < len(s.Symbol) {
return s.Symbol[s.index]
}
return 0
}

func (s *symbolParser) parseDescriptors() ([]*scip.Descriptor, error) {
var result []*scip.Descriptor
for s.index < len(s.Symbol) {
descriptor, err := s.parseDescriptor()
if err != nil {
return nil, err
}
result = append(result, descriptor)
}
return result, nil
}

func (s *symbolParser) parseDescriptor() (*scip.Descriptor, error) {
start := s.index
switch s.peekNext() {
case '(':
s.index++
name, err := s.acceptIdentifier("parameter name")
if err != nil {
return nil, err
}
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Parameter}, s.acceptCharacter(')', "closing parameter name")
case '[':
s.index++
name, err := s.acceptIdentifier("type parameter name")
if err != nil {
return nil, err
}
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_TypeParameter}, s.acceptCharacter(']', "closing type parameter name")
default:
name, err := s.acceptIdentifier("descriptor name")
if err != nil {
return nil, err
}
suffix := s.current()
s.index++
switch suffix {
case '(':
disambiguator := ""
if s.peekNext() != ')' {
disambiguator, err = s.acceptIdentifier("method disambiguator")
if err != nil {
return nil, err
}
}
err = s.acceptCharacter(')', "closing method")
if err != nil {
return nil, err
}
return &scip.Descriptor{Name: name, Disambiguator: disambiguator, Suffix: scip.Descriptor_Method}, s.acceptCharacter('.', "closing method")
case '/':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Namespace}, nil
case '.':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Term}, nil
case '#':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Type}, nil
case ':':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Meta}, nil
case '!':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Macro}, nil
default:
}
}

end := s.index
if s.index > len(s.Symbol) {
end = len(s.Symbol)
}
return nil, errors.Newf("unrecognized descriptor %q", string(s.Symbol[start:end]))
}

func (s *symbolParser) acceptIdentifier(what string) (string, error) {
if s.current() == '`' {
s.index++
return s.acceptBacktickEscapedIdentifier(what)
}
start := s.index
for s.index < len(s.Symbol) && shared.IsSimpleIdentifierCharacter(s.current()) {
s.index++
}
if start == s.index {
return "", s.error("empty identifier")
}
return string(s.Symbol[start:s.index]), nil
}

func (s *symbolParser) acceptSpaceEscapedIdentifier(what string) (string, error) {
return s.acceptEscapedIdentifier(what, ' ')
}

func (s *symbolParser) acceptBacktickEscapedIdentifier(what string) (string, error) {
return s.acceptEscapedIdentifier(what, '`')
}

func (s *symbolParser) acceptEscapedIdentifier(what string, escapeCharacter rune) (string, error) {
builder := strings.Builder{}
for s.index < len(s.Symbol) {
ch := s.current()
if ch == escapeCharacter {
s.index++
if s.index >= len(s.Symbol) {
break
}
if s.current() == escapeCharacter {
// Escaped space character.
builder.WriteRune(ch)
} else {
return builder.String(), nil
}
} else {
builder.WriteRune(ch)
}
s.index++
}
return "", s.error(fmt.Sprintf("reached end of symbol while parsing <%s>, expected a '%v' character", what, string(escapeCharacter)))
}

func (s *symbolParser) acceptCharacter(r rune, what string) error {
if s.current() == r {
s.index++
return nil
}
return s.error(fmt.Sprintf("expected '%v', obtained '%v', while parsing %v", string(r), string(s.current()), what))
}
Loading

0 comments on commit 12ff730

Please sign in to comment.