2 Commits

Author SHA1 Message Date
Alex Schroeder
8eb700fb0b Use full text search 2023-09-25 14:07:00 +02:00
Alex Schroeder
7514c2173b Add exact search from the command line 2023-09-25 09:21:13 +02:00
12 changed files with 323 additions and 71 deletions

View File

@@ -2,7 +2,7 @@ package main
import (
"encoding/json"
"fmt"
"log"
"github.com/gomarkdown/markdown/ast"
"github.com/gomarkdown/markdown/parser"
"io"
@@ -82,7 +82,7 @@ func account(p *parser.Parser, data []byte, offset int) (int, ast.Node) {
uri, ok := accounts.uris[string(account)]
defer accounts.RUnlock()
if !ok {
fmt.Printf("Looking up %s\n", account)
log.Printf("Looking up %s\n", account)
uri = "https://" + string(domain) + "/users/" + string(user[1:])
accounts.uris[string(account)] = uri // prevent more lookings
go lookUpAccountUri(string(account), string(domain))
@@ -103,26 +103,26 @@ func lookUpAccountUri(account, domain string) {
uri := "https://" + domain + "/.well-known/webfinger"
resp, err := http.Get(uri + "?resource=acct:" + account)
if err != nil {
fmt.Printf("Failed to look up %s: %s\n", account, err)
log.Printf("Failed to look up %s: %s", account, err)
return
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
fmt.Printf("Failed to read from %s: %s\n", account, err)
log.Printf("Failed to read from %s: %s", account, err)
return
}
var wf WebFinger
err = json.Unmarshal([]byte(body), &wf)
if err != nil {
fmt.Printf("Failed to parse the JSON from %s: %s\n", account, err)
log.Printf("Failed to parse the JSON from %s: %s", account, err)
return
}
uri, err = parseWebFinger(body)
if err != nil {
fmt.Printf("Could not find profile URI for %s: %s\n", account, err)
log.Printf("Could not find profile URI for %s: %s", account, err)
}
fmt.Printf("Found profile for %s: %s\n", account, uri)
log.Printf("Found profile for %s: %s", account, uri)
accounts.Lock()
defer accounts.Unlock()
accounts.uris[account] = uri

View File

@@ -18,7 +18,7 @@ Orange sky above
Reflects a distant fire
It's not `)}
p.save()
data := url.Values{}
data.Set("body", "barbecue")
@@ -29,7 +29,6 @@ It's not `)}
HTTPRedirectTo(t, makeHandler(appendHandler, true), "POST", "/append/testdata/fire", data, "/view/testdata/fire")
assert.Regexp(t, regexp.MustCompile("Its not barbecue"),
assert.HTTPBody(makeHandler(viewHandler, true), "GET", "/view/testdata/fire", nil))
t.Cleanup(func() {
_ = os.RemoveAll("testdata")
})

197
index.go
View File

@@ -1,43 +1,96 @@
// Read Artem Krylysov's blog post on full text search as an
// introduction.
// https://artem.krylysov.com/blog/2020/07/28/lets-build-a-full-text-search-engine/
package main
import (
trigram "github.com/dgryski/go-trigram"
import(
"io/fs"
"path/filepath"
"log"
"sort"
"strings"
"sync"
)
type docid uint
// Index contains the two maps used for search. Make sure to lock and
// unlock as appropriate.
type Index struct {
sync.RWMutex
// index is a struct containing the trigram index for search.
// It is generated at startup and updated after every page
// edit. The index is case-insensitive.
index trigram.Index
// next_id is the number of the next document added to the index
next_id docid
// documents is a map, mapping document ids of the index to
// page names.
documents map[trigram.DocID]string
// index is an inverted index mapping tokens to document ids.
token map[string][]docid
// names is a map, mapping page names to titles.
// documents is a map, mapping document ids to page names.
documents map[docid]string
// titles is a map, mapping page names to titles.
titles map[string]string
}
// idx is the global Index per wiki.
var index Index
// reset resets the Index. This assumes that the index is locked!
func (idx *Index) reset() {
idx.index = nil
idx.token = nil
idx.documents = nil
idx.titles = nil
}
// addDocument adds the text as a new document. This assumes that the
// index is locked!
func (idx *Index) addDocument(text string) docid {
id := idx.next_id; idx.next_id++
for _, token := range tokens(text) {
ids := idx.token[token]
// Don't add same ID more than once. Checking the last
// position of the []docid works because the id is
// always a new one, i.e. the last one, if at all.
if ids != nil && ids[len(ids)-1] == id {
continue
}
idx.token[token] = append(ids, id)
}
return id
}
// deleteDocument deletes the text as a new document. The id can no
// longer be used. This assumes that the index is locked!
func (idx *Index) deleteDocument(text string, id docid) {
for _, token := range tokens(text) {
ids := index.token[token]
// Tokens can appear multiple times in a text but they
// can only be deleted once. deleted.
if ids == nil {
continue
}
// If the token appears only in this document, remove
// the whole entry.
if len(ids) == 1 && ids[0] == id {
delete(index.token, token)
continue
}
// Otherwise, remove the token from the index.
i := sort.Search(len(ids), func(i int) bool { return ids[i] >= id })
if i != -1 && i < len(ids) && ids[i] == id {
copy(ids[i:], ids[i+1:])
index.token[token] = ids[:len(ids)-1]
continue
}
// If none of the above, then our docid wasn't
// indexed. This shouldn't happen, either.
log.Printf("The index for token %s does not contain doc id %d", token, id)
}
delete(index.documents, id)
}
// add reads a file and adds it to the index. This must happen while
// the idx is locked, which is true when called from loadIndex.
// the idx is locked.
func (idx *Index) add(path string, info fs.FileInfo, err error) error {
if err != nil {
return err
@@ -52,7 +105,8 @@ func (idx *Index) add(path string, info fs.FileInfo, err error) error {
return err
}
p.handleTitle(false)
id := idx.index.Add(strings.ToLower(string(p.Body)))
id := idx.addDocument(string(p.Body))
idx.documents[id] = p.Name
idx.titles[p.Name] = p.Title
return nil
@@ -63,8 +117,8 @@ func (idx *Index) add(path string, info fs.FileInfo, err error) error {
func (idx *Index) load() (int, error) {
idx.Lock()
defer idx.Unlock()
idx.index = make(trigram.Index)
idx.documents = make(map[trigram.DocID]string)
idx.token = make(map[string][]docid)
idx.documents = make(map[docid]string)
idx.titles = make(map[string]string)
err := filepath.Walk(".", idx.add)
if err != nil {
@@ -75,15 +129,23 @@ func (idx *Index) load() (int, error) {
return n, nil
}
// dump prints the index to the log for debugging. Must already be readlocked.
func (idx *Index) dump() {
index.RLock()
defer index.RUnlock()
for token, ids := range idx.token {
log.Printf("%s: %v", token, ids)
}
}
// updateIndex updates the index for a single page. The old text is
// loaded from the disk and removed from the index first, if it
// exists.
func (p *Page) updateIndex() {
index.Lock()
defer index.Unlock()
var id trigram.DocID
// This function does not rely on files actually existing, so
// let's quickly find the document id.
var id docid
// Reverse lookup! At least it's in memory.
for docId, name := range index.documents {
if name == p.Name {
id = docId
@@ -91,33 +153,94 @@ func (p *Page) updateIndex() {
}
}
if id == 0 {
id = index.index.Add(strings.ToLower(string(p.Body)))
id = index.addDocument(string(p.Body))
index.documents[id] = p.Name
index.titles[p.Name] = p.Title
} else {
o, err := loadPage(p.Name)
if err == nil {
index.index.Delete(strings.ToLower(string(o.Body)), id)
o.handleTitle(false)
delete(index.titles, o.Title)
if o, err := loadPage(p.Name); err == nil {
index.deleteDocument(string(o.Body), id)
}
index.index.Insert(strings.ToLower(string(p.Body)), id)
// Do not reuse the old id. We need a new one for
// indexing to work.
id = index.addDocument(string(p.Body))
index.documents[id] = p.Name
p.handleTitle(false)
// The page name stays the same but the title may have
// changed.
index.titles[p.Name] = p.Title
}
}
// searchDocuments searches the index for a string. This requires the
// index to be locked.
func searchDocuments(q string) []string {
words := strings.Fields(strings.ToLower(q))
var trigrams []trigram.T
for _, word := range words {
trigrams = trigram.Extract(word, trigrams)
// removeFromIndex removes the page from the index. Do this when
// deleting a page.
func (p *Page) removeFromIndex() {
index.Lock()
defer index.Unlock()
var id docid
// Reverse lookup! At least it's in memory.
for docId, name := range index.documents {
if name == p.Name {
id = docId
break
}
}
ids := index.index.QueryTrigrams(trigrams)
names := make([]string, len(ids))
for i, id := range ids {
names[i] = index.documents[id]
if id == 0 {
log.Printf("Page %s is not indexed", p.Name)
return
}
o, err := loadPage(p.Name)
if err != nil {
log.Printf("Page %s cannot removed from the index: %s", p.Name, err)
return
}
index.deleteDocument(string(o.Body), id)
}
// search searches the index for a query string and returns the
// matching page names. The query is tokenized exactly like the
// indexed documents; a page matches only if it contains every token,
// computed as the intersection of the per-token posting lists. If any
// token is unknown to the index, no page can match and the search is
// aborted with an empty result.
func (idx *Index) search(q string) []string {
	// NOTE(review): this locks the package-global index rather than
	// idx — fine while there is a single wiki index, but confirm if
	// multiple indexes are ever used.
	index.RLock()
	defer index.RUnlock()
	// r accumulates the document ids matching all tokens seen so far;
	// nil means "no token processed yet".
	var r []docid
	for _, token := range tokens(q) {
		if ids, ok := idx.token[token]; ok {
			if r == nil {
				r = ids
			} else {
				r = intersection(r, ids)
			}
		} else {
			// Token doesn't exist therefore abort search.
			return nil
		}
	}
	// Map the surviving document ids back to page names.
	names := make([]string, 0)
	for _, id := range r {
		names = append(names, idx.documents[id])
	}
	return names
}
// intersection returns the set intersection between a and b.
// a and b have to be sorted in ascending order and contain no
// duplicates; the result then shares both properties.
func intersection(a []docid, b []docid) []docid {
	// The intersection can never be larger than the smaller of the
	// two inputs, so preallocate at most that much.
	maxLen := len(a)
	if len(b) < maxLen {
		maxLen = len(b)
	}
	r := make([]docid, 0, maxLen)
	// Classic two-pointer merge over the two sorted slices.
	var i, j int
	for i < len(a) && j < len(b) {
		if a[i] < b[j] {
			i++
		} else if a[i] > b[j] {
			j++
		} else {
			r = append(r, a[i])
			i++
			j++
		}
	}
	return r
}

View File

@@ -27,9 +27,10 @@ func TestSearchHashtag(t *testing.T) {
assert.NotZero(t, len(pages))
}
// wipes testdata
func TestIndexUpdates(t *testing.T) {
name := "test"
_ = os.Remove(name + ".md")
_ = os.RemoveAll("testdata")
name := "testdata/test"
index.load()
p := &Page{Name: name, Body: []byte("This is a test.")}
p.save()
@@ -92,6 +93,6 @@ func TestIndexUpdates(t *testing.T) {
assert.True(t, found)
t.Cleanup(func() {
_ = os.Remove(name + ".md")
_ = os.RemoveAll("testdata")
})
}

View File

@@ -2,7 +2,7 @@ package main
import (
"bytes"
"fmt"
"log"
"github.com/microcosm-cc/bluemonday"
"html/template"
"net/url"
@@ -60,6 +60,7 @@ func (p *Page) save() error {
filename := p.Name + ".md"
s := bytes.ReplaceAll(p.Body, []byte{'\r'}, []byte{})
if len(s) == 0 {
p.removeFromIndex()
_ = os.Rename(filename, filename+"~")
return os.Remove(filename)
}
@@ -69,7 +70,7 @@ func (p *Page) save() error {
if d != "." {
err := os.MkdirAll(d, 0755)
if err != nil {
fmt.Printf("Creating directory %s failed", d)
log.Printf("Creating directory %s failed: %s", d, err)
return err
}
}

View File

@@ -1,7 +1,7 @@
package main
import (
"fmt"
"log"
"net/http"
"slices"
"strconv"
@@ -70,7 +70,7 @@ func load(names []string) []*Page {
for i, name := range names {
p, err := loadPage(name)
if err != nil {
fmt.Printf("Error loading %s\n", name)
log.Printf("Error loading %s: %s", name, err)
} else {
items[i] = p
}
@@ -89,10 +89,8 @@ func search(q string, page int) ([]*Page, bool, int) {
if len(q) == 0 {
return make([]*Page, 0), false, 0
}
index.RLock()
names := searchDocuments(q)
names := index.search(q)
slices.SortFunc(names, sortNames(q))
index.RUnlock()
from := itemsPerPage * (page - 1)
if from > len(names) {
return make([]*Page, 0), false, 0

View File

@@ -7,14 +7,21 @@ import (
"github.com/google/subcommands"
"io"
"os"
"strings"
"bytes"
"slices"
"path/filepath"
"io/fs"
)
type searchCmd struct {
page int
exact bool
}
func (cmd *searchCmd) SetFlags(f *flag.FlagSet) {
f.IntVar(&cmd.page, "page", 1, "the page in the search result set")
f.BoolVar(&cmd.exact, "exact", false, "look for exact matches (do not use the trigram index)")
}
func (*searchCmd) Name() string { return "search" }
@@ -29,15 +36,21 @@ func (*searchCmd) Usage() string {
}
func (cmd *searchCmd) Execute(_ context.Context, f *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus {
return searchCli(os.Stdout, cmd.page, f.Args())
return searchCli(os.Stdout, cmd.page, cmd.exact, f.Args())
}
// searchCli runs the search command on the command line. It is used
// here with an io.Writer for easy testing.
func searchCli(w io.Writer, n int, args []string) subcommands.ExitStatus {
index.load()
func searchCli(w io.Writer, n int, exact bool, args []string) subcommands.ExitStatus {
var fn func(q string, n int) ([]*Page, bool, int);
if (exact) {
fn = searchExact
} else {
index.load()
fn = search
}
for _, q := range args {
items, more, _ := search(q, n)
items, more, _ := fn(q, n)
if len(items) == 1 {
fmt.Fprintf(w, "Search for %s, page %d: 1 result\n", q, n)
} else {
@@ -52,3 +65,58 @@ func searchCli(w io.Writer, n int, args []string) subcommands.ExitStatus {
}
return subcommands.ExitSuccess
}
// searchExact opens all the files and searches them, one by one. It
// walks the current directory, loads every Markdown page, and keeps
// the pages whose body contains every whitespace-separated term of q
// as an exact byte sequence (case-sensitive, unlike the token index).
// The matches are sorted, scored, summarized, and paginated. It
// returns the items for the requested result page, whether more
// results follow, and the total number of result pages.
func searchExact(q string, page int) ([]*Page, bool, int) {
	// An empty query matches nothing.
	if len(q) == 0 {
		return make([]*Page, 0), false, 0
	}
	terms := bytes.Fields([]byte(q))
	pages := make(map[string]*Page)
	names := make([]string, 0)
	// NOTE(review): this replaces the global title map without taking
	// the index lock; presumably safe because exact search only runs
	// from the command line, not while serving — confirm.
	index.titles = make(map[string]string)
	err := filepath.Walk(".", func (path string, info fs.FileInfo, err error) error {
		if err != nil {
			return err
		}
		filename := path
		// Skip directories, hidden entries, and anything that
		// is not a Markdown page.
		if info.IsDir() || strings.HasPrefix(filename, ".") || !strings.HasSuffix(filename, ".md") {
			return nil
		}
		name := strings.TrimSuffix(filename, ".md")
		p, err := loadPage(name)
		if err != nil {
			return err
		}
		// Every term must appear in the page body, otherwise
		// the page is skipped.
		for _, term := range terms {
			if !bytes.Contains(p.Body, term) {
				return nil
			}
		}
		p.handleTitle(false)
		pages[p.Name] = p
		index.titles[p.Name] = p.Title
		names = append(names, p.Name)
		return nil
	})
	if err != nil {
		return make([]*Page, 0), false, 0
	}
	slices.SortFunc(names, sortNames(q))
	// Paginate: compute the [from, to) window of names for the
	// requested result page.
	from := itemsPerPage * (page - 1)
	if from > len(names) {
		return make([]*Page, 0), false, 0
	}
	to := from + itemsPerPage
	if to > len(names) {
		to = len(names)
	}
	items := make([]*Page, 0)
	for i := from; i<to; i++ {
		p := pages[names[i]]
		p.score(q)
		p.summarize(q)
		items = append(items, p)
	}
	return items, to < len(names), len(names)/itemsPerPage + 1
}

View File

@@ -9,7 +9,7 @@ import (
func TestSearchCmd(t *testing.T) {
b := new(bytes.Buffer)
s := searchCli(b, 1, []string{"oddµ"})
s := searchCli(b, 1, false, []string{"oddµ"})
assert.Equal(t, subcommands.ExitSuccess, s)
r := `Search for oddµ, page 1: 2 results
* [Oddµ: A minimal wiki](README) (5)

View File

@@ -19,12 +19,14 @@ func TestSearchQuestionmark(t *testing.T) {
_ = os.RemoveAll("testdata")
p := &Page{Name: "testdata/Odd?", Body: []byte(`# Even?
yes or no?`)}
We look at the plants.
They need water. We need us.
The silence streches.`)}
p.save()
data := url.Values{}
data.Set("q", "yes")
data.Set("q", "look")
body := assert.HTTPBody(searchHandler, "GET", "/search", data)
assert.Contains(t, body, "yes or no?")
assert.Contains(t, body, "We look")
assert.NotContains(t, body, "Odd?")
assert.Contains(t, body, "Even?")
}

45
tokenizer.go Normal file
View File

@@ -0,0 +1,45 @@
package main
import (
	"strings"
	"unicode"
	"unicode/utf8"
)
// tokenize splits text into tokens. A token is a maximal run of
// letters, digits, and '#' characters (the hash sign is kept so that
// hash tags survive tokenization); everything else is a separator.
func tokenize(text string) []string {
	isSeparator := func(r rune) bool {
		if r == '#' {
			return false
		}
		return !(unicode.IsLetter(r) || unicode.IsNumber(r))
	}
	return strings.FieldsFunc(text, isSeparator)
}
// shortWordFilter removes all the words three characters or less
// except for all caps words like USA, EUR, CHF and the like. Word
// length is measured in runes rather than bytes, so short words
// containing multi-byte characters (e.g. "für") are filtered, too.
func shortWordFilter(tokens []string) []string {
	r := make([]string, 0, len(tokens))
	for _, token := range tokens {
		// Count characters, not bytes: len(token) would overcount
		// any token containing multi-byte runes.
		n := utf8.RuneCountInString(token)
		if n > 3 ||
			n == 3 && token == strings.ToUpper(token) {
			r = append(r, token)
		}
	}
	return r
}
// lowercaseFilter maps every token to its lower-case form and returns
// the result as a new slice of the same length; the input slice is
// left untouched.
func lowercaseFilter(tokens []string) []string {
	lowered := make([]string, len(tokens))
	for i := range tokens {
		lowered[i] = strings.ToLower(tokens[i])
	}
	return lowered
}
// tokens returns a slice of tokens.
func tokens(text string) []string {
tokens := tokenize(text)
tokens = shortWordFilter(tokens)
tokens = lowercaseFilter(tokens)
return tokens
}

15
tokenizer_test.go Normal file
View File

@@ -0,0 +1,15 @@
package main
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestTokenizer exercises the whole tokens pipeline: empty input,
// short-word filtering, the three-letter acronym exception, and
// lower-casing.
func TestTokenizer(t *testing.T) {
	cases := []struct {
		name  string
		input string
		want  []string
	}{
		{"empty string", "", []string{}},
		{"no short words", "the a", []string{}},
		{"three letter acronyms", "CHF", []string{"chf"}},
		{"no two letter acronyms", "CH", []string{}},
		{"lower case", "Franc", []string{"franc"}},
		{"sentence", "I don't know what to do.", []string{"know", "what"}},
	}
	for _, c := range cases {
		assert.EqualValues(t, c.want, tokens(c.input), c.name)
	}
}

14
wiki.go
View File

@@ -3,7 +3,7 @@ package main
import (
"context"
"flag"
"fmt"
"log"
"github.com/google/subcommands"
"html/template"
"net/http"
@@ -69,12 +69,12 @@ func getPort() string {
// and after. For testing, call index.load directly and skip the
// messages.
func scheduleLoadIndex() {
fmt.Print("Indexing pages\n")
log.Print("Indexing pages")
n, err := index.load()
if err == nil {
fmt.Printf("Indexed %d pages\n", n)
log.Printf("Indexed %d pages", n)
} else {
fmt.Println("Indexing failed")
log.Printf("Indexing failed: %s", err)
}
}
@@ -82,9 +82,9 @@ func scheduleLoadIndex() {
// and after. For testing, call loadLanguages directly and skip the
// messages.
func scheduleLoadLanguages() {
fmt.Print("Loading languages\n")
log.Print("Loading languages")
n := loadLanguages()
fmt.Printf("Loaded %d languages\n", n)
log.Printf("Loaded %d languages", n)
}
func serve() {
@@ -101,7 +101,7 @@ func serve() {
go scheduleLoadLanguages()
initAccounts()
port := getPort()
fmt.Printf("Serving a wiki on port %s\n", port)
log.Printf("Serving a wiki on port %s", port)
http.ListenAndServe(":"+port, nil)
}