Skip to content

Commit 9bc69ff

Browse files
lunny and zeripath authored
Support elastic search for code search (#10273)
* Support elastic search for code search
* Finished elastic search implementation and add some tests
* Enable test on drone and added docs
* Add new fields to elastic search
* Fix bug
* remove unused changes
* Use indexer alias to keep the gitea indexer version
* Improve codes
* Some code improvements
* The real indexer name changed to xxx.v1

Co-authored-by: zeripath <[email protected]>
1 parent d257485 commit 9bc69ff

File tree

14 files changed

+694
-164
lines changed

14 files changed

+694
-164
lines changed

.drone.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,7 @@ steps:
209209
TAGS: bindata
210210
TEST_LDAP: 1
211211
USE_REPO_TEST_DIR: 1
212+
TEST_INDEXER_CODE_ES_URL: "http://elastic:changeme@elasticsearch:9200"
212213
depends_on:
213214
- build
214215

custom/conf/app.example.ini

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,15 @@ STARTUP_TIMEOUT=30s
428428

429429
; repo indexer by default disabled, since it uses a lot of disk space
430430
REPO_INDEXER_ENABLED = false
431+
; Code search engine type, could be `bleve` or `elasticsearch`.
432+
REPO_INDEXER_TYPE = bleve
433+
; Index file used for code search.
431434
REPO_INDEXER_PATH = indexers/repos.bleve
435+
; Code indexer connection string, available when `REPO_INDEXER_TYPE` is elasticsearch, e.g. http://elastic:changeme@localhost:9200
436+
REPO_INDEXER_CONN_STR =
437+
; Code indexer name, available when `REPO_INDEXER_TYPE` is elasticsearch
438+
REPO_INDEXER_NAME = gitea_codes
439+
432440
UPDATE_BUFFER_LEN = 20
433441
MAX_FILE_SIZE = 1048576
434442
; A comma separated list of glob patterns (see https://github.com/gobwas/glob) to include

docs/content/doc/advanced/config-cheat-sheet.en-us.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,11 @@ relation to port exhaustion.
270270
- `ISSUE_INDEXER_QUEUE_BATCH_NUMBER`: **20**: Batch queue number.
271271

272272
- `REPO_INDEXER_ENABLED`: **false**: Enables code search (uses a lot of disk space, about 6 times more than the repository size).
273+
- `REPO_INDEXER_TYPE`: **bleve**: Code search engine type, could be `bleve` or `elasticsearch`.
273274
- `REPO_INDEXER_PATH`: **indexers/repos.bleve**: Index file used for code search.
275+
- `REPO_INDEXER_CONN_STR`: **empty**: Code indexer connection string, available when `REPO_INDEXER_TYPE` is elasticsearch, e.g. http://elastic:changeme@localhost:9200
276+
- `REPO_INDEXER_NAME`: **gitea_codes**: Code indexer name, available when `REPO_INDEXER_TYPE` is elasticsearch
277+
274278
- `REPO_INDEXER_INCLUDE`: **empty**: A comma separated list of glob patterns (see https://github.com/gobwas/glob) to **include** in the index. Use `**.txt` to match any files with .txt extension. An empty list means include all files.
275279
- `REPO_INDEXER_EXCLUDE`: **empty**: A comma separated list of glob patterns (see https://github.com/gobwas/glob) to **exclude** from the index. Files that match this list will not be indexed, even if they match in `REPO_INDEXER_INCLUDE`.
276280
- `REPO_INDEXER_EXCLUDE_VENDORED`: **true**: Exclude vendored files from index.

docs/content/doc/advanced/config-cheat-sheet.zh-cn.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,12 @@ menu:
9898
- `ISSUE_INDEXER_QUEUE_CONN_STR`: **addrs=127.0.0.1:6379 db=0**: 当 `ISSUE_INDEXER_QUEUE_TYPE` 为 `redis` 时,保存Redis队列的连接字符串。
9999
- `ISSUE_INDEXER_QUEUE_BATCH_NUMBER`: **20**: 队列处理中批量提交数量。
100100

101-
- `REPO_INDEXER_ENABLED`: **false**: 是否启用代码搜索(启用后会占用比较大的磁盘空间)。
101+
- `REPO_INDEXER_ENABLED`: **false**: 是否启用代码搜索(启用后会占用比较大的磁盘空间,如果是bleve可能需要占用约6倍存储空间)。
102+
- `REPO_INDEXER_TYPE`: **bleve**: 代码搜索引擎类型,可以为 `bleve` 或者 `elasticsearch`
102103
- `REPO_INDEXER_PATH`: **indexers/repos.bleve**: 用于代码搜索的索引文件路径。
104+
- `REPO_INDEXER_CONN_STR`: **空**: 代码搜索引擎连接字符串,当 `REPO_INDEXER_TYPE` 为 `elasticsearch` 时有效。例如: http://elastic:changeme@localhost:9200
105+
- `REPO_INDEXER_NAME`: **gitea_codes**: 代码搜索引擎的名字,当 `REPO_INDEXER_TYPE` 为 `elasticsearch` 时有效。
106+
103107
- `UPDATE_BUFFER_LEN`: **20**: 代码索引请求的缓冲区长度。
104108
- `MAX_FILE_SIZE`: **1048576**: 进行解析的源代码文件的最大长度,小于该值时才会索引。
105109

modules/indexer/code/bleve.go

Lines changed: 51 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,10 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
5858
})
5959
}
6060

61-
// openIndexer open the index at the specified path, checking for metadata
61+
// openBleveIndexer opens the index at the specified path, checking for metadata
6262
// updates and bleve version updates. If index needs to be created (or
6363
// re-created), returns (nil, nil)
64-
func openIndexer(path string, latestVersion int) (bleve.Index, error) {
64+
func openBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
6565
_, err := os.Stat(path)
6666
if err != nil && os.IsNotExist(err) {
6767
return nil, nil
@@ -104,54 +104,14 @@ func (d *RepoIndexerData) Type() string {
104104
return repoIndexerDocType
105105
}
106106

107-
func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
108-
// Ignore vendored files in code search
109-
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
110-
return nil
111-
}
112-
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
113-
RunInDir(repo.RepoPath())
114-
if err != nil {
115-
return err
116-
}
117-
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
118-
return fmt.Errorf("Misformatted git cat-file output: %v", err)
119-
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
120-
return addDelete(update.Filename, repo, batch)
121-
}
122-
123-
fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
124-
RunInDirBytes(repo.RepoPath())
125-
if err != nil {
126-
return err
127-
} else if !base.IsTextFile(fileContents) {
128-
// FIXME: UTF-16 files will probably fail here
129-
return nil
130-
}
131-
132-
id := filenameIndexerID(repo.ID, update.Filename)
133-
return batch.Index(id, &RepoIndexerData{
134-
RepoID: repo.ID,
135-
CommitID: commitSha,
136-
Content: string(charset.ToUTF8DropErrors(fileContents)),
137-
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
138-
UpdatedAt: time.Now().UTC(),
139-
})
140-
}
141-
142-
func addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
143-
id := filenameIndexerID(repo.ID, filename)
144-
return batch.Delete(id)
145-
}
146-
147107
const (
148108
repoIndexerAnalyzer = "repoIndexerAnalyzer"
149109
repoIndexerDocType = "repoIndexerDocType"
150110
repoIndexerLatestVersion = 5
151111
)
152112

153-
// createRepoIndexer create a repo indexer if one does not already exist
154-
func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
113+
// createBleveIndexer creates a bleve repo indexer if one does not already exist
114+
func createBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
155115
docMapping := bleve.NewDocumentMapping()
156116
numericFieldMapping := bleve.NewNumericFieldMapping()
157117
numericFieldMapping.IncludeInAll = false
@@ -199,18 +159,6 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
199159
return indexer, nil
200160
}
201161

202-
func filenameIndexerID(repoID int64, filename string) string {
203-
return indexerID(repoID) + "_" + filename
204-
}
205-
206-
func filenameOfIndexerID(indexerID string) string {
207-
index := strings.IndexByte(indexerID, '_')
208-
if index == -1 {
209-
log.Error("Unexpected ID in repo indexer: %s", indexerID)
210-
}
211-
return indexerID[index+1:]
212-
}
213-
214162
var (
215163
_ Indexer = &BleveIndexer{}
216164
)
@@ -230,18 +178,59 @@ func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) {
230178
return indexer, created, err
231179
}
232180

181+
func (b *BleveIndexer) addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
182+
// Ignore vendored files in code search
183+
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
184+
return nil
185+
}
186+
187+
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
188+
RunInDir(repo.RepoPath())
189+
if err != nil {
190+
return err
191+
}
192+
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
193+
return fmt.Errorf("Misformatted git cat-file output: %v", err)
194+
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
195+
return b.addDelete(update.Filename, repo, batch)
196+
}
197+
198+
fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
199+
RunInDirBytes(repo.RepoPath())
200+
if err != nil {
201+
return err
202+
} else if !base.IsTextFile(fileContents) {
203+
// FIXME: UTF-16 files will probably fail here
204+
return nil
205+
}
206+
207+
id := filenameIndexerID(repo.ID, update.Filename)
208+
return batch.Index(id, &RepoIndexerData{
209+
RepoID: repo.ID,
210+
CommitID: commitSha,
211+
Content: string(charset.ToUTF8DropErrors(fileContents)),
212+
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
213+
UpdatedAt: time.Now().UTC(),
214+
})
215+
}
216+
217+
func (b *BleveIndexer) addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
218+
id := filenameIndexerID(repo.ID, filename)
219+
return batch.Delete(id)
220+
}
221+
233222
// init initializes the indexer
234223
func (b *BleveIndexer) init() (bool, error) {
235224
var err error
236-
b.indexer, err = openIndexer(b.indexDir, repoIndexerLatestVersion)
225+
b.indexer, err = openBleveIndexer(b.indexDir, repoIndexerLatestVersion)
237226
if err != nil {
238227
return false, err
239228
}
240229
if b.indexer != nil {
241230
return false, nil
242231
}
243232

244-
b.indexer, err = createRepoIndexer(b.indexDir, repoIndexerLatestVersion)
233+
b.indexer, err = createBleveIndexer(b.indexDir, repoIndexerLatestVersion)
245234
if err != nil {
246235
return false, err
247236
}
@@ -262,38 +251,19 @@ func (b *BleveIndexer) Close() {
262251
}
263252

264253
// Index indexes the data
265-
func (b *BleveIndexer) Index(repoID int64) error {
266-
repo, err := models.GetRepositoryByID(repoID)
267-
if err != nil {
268-
return err
269-
}
270-
271-
sha, err := getDefaultBranchSha(repo)
272-
if err != nil {
273-
return err
274-
}
275-
changes, err := getRepoChanges(repo, sha)
276-
if err != nil {
277-
return err
278-
} else if changes == nil {
279-
return nil
280-
}
281-
254+
func (b *BleveIndexer) Index(repo *models.Repository, sha string, changes *repoChanges) error {
282255
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
283256
for _, update := range changes.Updates {
284-
if err := addUpdate(sha, update, repo, batch); err != nil {
257+
if err := b.addUpdate(sha, update, repo, batch); err != nil {
285258
return err
286259
}
287260
}
288261
for _, filename := range changes.RemovedFilenames {
289-
if err := addDelete(filename, repo, batch); err != nil {
262+
if err := b.addDelete(filename, repo, batch); err != nil {
290263
return err
291264
}
292265
}
293-
if err = batch.Flush(); err != nil {
294-
return err
295-
}
296-
return repo.UpdateIndexerStatus(models.RepoIndexerTypeCode, sha)
266+
return batch.Flush()
297267
}
298268

299269
// Delete deletes indexes by ids

modules/indexer/code/bleve_test.go

Lines changed: 3 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,15 @@ package code
66

77
import (
88
"io/ioutil"
9-
"path/filepath"
109
"testing"
1110

1211
"code.gitea.io/gitea/models"
13-
"code.gitea.io/gitea/modules/setting"
1412
"code.gitea.io/gitea/modules/util"
1513

1614
"github.com/stretchr/testify/assert"
1715
)
1816

19-
func TestMain(m *testing.M) {
20-
models.MainTest(m, filepath.Join("..", "..", ".."))
21-
}
22-
23-
func TestIndexAndSearch(t *testing.T) {
17+
func TestBleveIndexAndSearch(t *testing.T) {
2418
models.PrepareTestEnv(t)
2519

2620
dir, err := ioutil.TempDir("", "bleve.index")
@@ -31,56 +25,15 @@ func TestIndexAndSearch(t *testing.T) {
3125
}
3226
defer util.RemoveAll(dir)
3327

34-
setting.Indexer.RepoIndexerEnabled = true
3528
idx, _, err := NewBleveIndexer(dir)
3629
if err != nil {
37-
assert.Fail(t, "Unable to create indexer Error: %v", err)
30+
assert.Fail(t, "Unable to create bleve indexer Error: %v", err)
3831
if idx != nil {
3932
idx.Close()
4033
}
4134
return
4235
}
4336
defer idx.Close()
4437

45-
err = idx.Index(1)
46-
assert.NoError(t, err)
47-
48-
var (
49-
keywords = []struct {
50-
Keyword string
51-
IDs []int64
52-
Langs int
53-
}{
54-
{
55-
Keyword: "Description",
56-
IDs: []int64{1},
57-
Langs: 1,
58-
},
59-
{
60-
Keyword: "repo1",
61-
IDs: []int64{1},
62-
Langs: 1,
63-
},
64-
{
65-
Keyword: "non-exist",
66-
IDs: []int64{},
67-
Langs: 0,
68-
},
69-
}
70-
)
71-
72-
for _, kw := range keywords {
73-
total, res, langs, err := idx.Search(nil, "", kw.Keyword, 1, 10)
74-
assert.NoError(t, err)
75-
assert.EqualValues(t, len(kw.IDs), total)
76-
77-
assert.NotNil(t, langs)
78-
assert.Len(t, langs, kw.Langs)
79-
80-
var ids = make([]int64, 0, len(res))
81-
for _, hit := range res {
82-
ids = append(ids, hit.RepoID)
83-
}
84-
assert.EqualValues(t, kw.IDs, ids)
85-
}
38+
testIndexer("beleve", t, idx)
8639
}

0 commit comments

Comments
 (0)