1 Commits

Author SHA1 Message Date
Alex Schroeder
153a179d92 Search results are based on page titles only 2023-09-16 23:36:17 +02:00
7 changed files with 95 additions and 54 deletions

View File

@@ -119,15 +119,13 @@ is a byte array and that's why we need to call `printf`).
For the `search.html` template only:
`{{.Page}}` is the page number in the results.
`{{.Previous}}` and `{{.Next}}` are the previous and next page number
in the results since doing arithmetic in templates is hard. The first
page number is 1.
`{{.Previous}}`, `{{.Page}}`, `{{.Next}}` and `{{.Last}}` are the
previous, current, next and last page number in the results since
doing arithmetic in templates is hard. The first page number is 1.
`{{.More}}` indicates if there are any more search results.
`{{.Results}}` indicates if there were any search results.
`{{.Results}}` indicates if there were any search results at all.
`{{.Items}}` is an array of pages, each containing a search result. A
search result is a page (with the properties seen above). Thus, to
@@ -441,6 +439,22 @@ and "rail", a search for "mail" returns a match because the trigrams
"mai" and "ail" are found. In this situation, the result has a score
of 0.
The sorting of all the pages, however, does not depend on scoring!
Computing the score is expensive because the page must be loaded from
disk. Therefore, results are sorted by title:
- If the page title contains the query string, it gets sorted first.
- If the page title begins with a number, it is sorted descending.
- All other pages follow, sorted ascending.
The effect is that first, the pages with matches in the page title are
shown, and then all the others. Within these two groups, the most
recent blog posts are shown first, if and only if the page title
begins with an ISO date like 2023-09-16.
The score and the highlighting of snippets are used to help visitors
decide which links to click.
## Limitations
Page titles are filenames with `.md` appended. If your filesystem

View File

@@ -17,7 +17,7 @@ func commands() {
} else if len(os.Args) > 2 && os.Args[1] == "search" {
index.load()
for _, q := range os.Args[2:] {
items, more := search(q, 1)
items, more, _ := search(q, 1)
fmt.Printf("Search %s: %d results\n", q, len(items))
for _, p := range items {
fmt.Printf("* %s (%d)\n", p.Title, p.Score)

View File

@@ -10,6 +10,6 @@ func TestLoadAndSearch(t *testing.T) {
index.reset()
go index.load()
q := "Oddµ"
pages, _ := search(q, 1)
pages, _, _ := search(q, 1)
assert.Zero(t, len(pages))
}

View File

@@ -21,6 +21,9 @@ type Index struct {
// documents is a map, mapping document ids of the index to
// page names.
documents map[trigram.DocID]string
// titles is a map, mapping page names to titles.
titles map[string]string
}
// index is the global Index per wiki.
@@ -30,6 +33,7 @@ var index Index
// reset discards the trigram index, the document map and the titles
// map by setting all three to nil.
func (idx *Index) reset() {
	idx.index, idx.documents, idx.titles = nil, nil, nil
}
// add reads a file and adds it to the index. This must happen while
@@ -47,8 +51,10 @@ func (idx *Index) add(path string, info fs.FileInfo, err error) error {
if err != nil {
return err
}
p.handleTitle(false)
id := idx.index.Add(strings.ToLower(string(p.Body)))
idx.documents[id] = p.Name
idx.titles[p.Name] = p.Title
return nil
}
@@ -59,6 +65,7 @@ func (idx *Index) load() (int, error) {
defer idx.Unlock()
idx.index = make(trigram.Index)
idx.documents = make(map[trigram.DocID]string)
idx.titles = make(map[string]string)
err := filepath.Walk(".", idx.add)
if err != nil {
idx.reset()
@@ -90,23 +97,27 @@ func (p *Page) updateIndex() {
o, err := loadPage(p.Name)
if err == nil {
index.index.Delete(strings.ToLower(string(o.Body)), id)
o.handleTitle(false)
delete(index.titles, o.Title)
}
index.index.Insert(strings.ToLower(string(p.Body)), id)
p.handleTitle(false)
index.titles[p.Name] = p.Title
}
}
// searchDocuments searches the index for the query string q and
// returns the names of the matching pages. The query is lower-cased
// and split on whitespace, trigrams are extracted from every word,
// and the index is queried with the collected trigrams.
//
// The caller must hold at least a read lock on the index. The lock
// is deliberately not taken here: search already holds the read lock
// when it calls this function, and a sync.RWMutex must not be
// read-locked recursively because that can deadlock against a
// waiting writer.
func searchDocuments(q string) []string {
	var trigrams []trigram.T
	for _, word := range strings.Fields(strings.ToLower(q)) {
		trigrams = trigram.Extract(word, trigrams)
	}
	ids := index.index.QueryTrigrams(trigrams)
	// Translate document ids into page names, keeping the order
	// returned by the index.
	names := make([]string, len(ids))
	for i, id := range ids {
		names[i] = index.documents[id]
	}
	return names
}

View File

@@ -11,7 +11,7 @@ import (
func TestIndex(t *testing.T) {
index.load()
q := "Oddµ"
pages, _ := search(q, 1)
pages, _, _ := search(q, 1)
assert.NotZero(t, len(pages))
for _, p := range pages {
assert.NotContains(t, p.Title, "<b>")
@@ -23,7 +23,7 @@ func TestIndex(t *testing.T) {
func TestSearchHashtag(t *testing.T) {
index.load()
q := "#Another_Tag"
pages, _ := search(q, 1)
pages, _, _ := search(q, 1)
assert.NotZero(t, len(pages))
}
@@ -35,7 +35,7 @@ func TestIndexUpdates(t *testing.T) {
p.save()
// Find the phrase
pages, _ := search("This is a test", 1)
pages, _, _ := search("This is a test", 1)
found := false
for _, p := range pages {
if p.Name == name {
@@ -46,7 +46,7 @@ func TestIndexUpdates(t *testing.T) {
assert.True(t, found)
// Find the phrase, case insensitive
pages, _ = search("this is a test", 1)
pages, _, _ = search("this is a test", 1)
found = false
for _, p := range pages {
if p.Name == name {
@@ -57,7 +57,7 @@ func TestIndexUpdates(t *testing.T) {
assert.True(t, found)
// Find some words
pages, _ = search("this test", 1)
pages, _, _ = search("this test", 1)
found = false
for _, p := range pages {
if p.Name == name {
@@ -70,7 +70,7 @@ func TestIndexUpdates(t *testing.T) {
// Update the page and no longer find it with the old phrase
p = &Page{Name: name, Body: []byte("Guvf vf n grfg.")}
p.save()
pages, _ = search("This is a test", 1)
pages, _, _ = search("This is a test", 1)
found = false
for _, p := range pages {
if p.Name == name {
@@ -81,7 +81,7 @@ func TestIndexUpdates(t *testing.T) {
assert.False(t, found)
// Find page using a new word
pages, _ = search("Guvf", 1)
pages, _, _ = search("Guvf", 1)
found = false
for _, p := range pages {
if p.Name == name {

View File

@@ -5,6 +5,7 @@ import (
"net/http"
"slices"
"strconv"
"strings"
"unicode"
"unicode/utf8"
)
@@ -19,38 +20,48 @@ type Search struct {
Previous int
Page int
Next int
Last int
More bool
Results bool
}
func sortItems(a, b *Page) int {
// Sort by score
if a.Score < b.Score {
return 1
} else if a.Score > b.Score {
return -1
}
// If the score is the same and both page names start
// with a number (like an ISO date), sort descending.
ra, _ := utf8.DecodeRuneInString(a.Title)
rb, _ := utf8.DecodeRuneInString(b.Title)
if unicode.IsNumber(ra) && unicode.IsNumber(rb) {
if a.Title < b.Title {
return 1
} else if a.Title > b.Title {
// sortNames returns a sort function that sorts in three stages: 1.
// whether the query string matches the page title; 2. descending if
// the page titles start with a digit; 3. otherwise ascending.
// Access to the index requires a read lock!
func sortNames(q string) func (a, b string) int {
return func (a, b string) int {
// If only one page contains the query string, it
// takes precedence.
ia := strings.Contains(index.titles[a], q)
ib := strings.Contains(index.titles[b], q)
if (ia && !ib) {
return -1
} else if (!ia && ib) {
return 1
}
// If both page names start with a number (like an ISO date),
// sort descending.
ra, _ := utf8.DecodeRuneInString(a)
rb, _ := utf8.DecodeRuneInString(b)
if unicode.IsNumber(ra) && unicode.IsNumber(rb) {
if a < b {
return 1
} else if a > b {
return -1
} else {
return 0
}
}
// Otherwise sort ascending.
if a < b {
return -1
} else if a > b {
return 1
} else {
return 0
}
}
// Otherwise sort ascending.
if a.Title < b.Title {
return -1
} else if a.Title > b.Title {
return 1
} else {
return 0
}
}
// load the pages named.
@@ -67,32 +78,35 @@ func load(names []string) []*Page {
return items
}
// itemsPerPage says how many items to print on a page of search
// results. search uses it to compute which slice of the sorted names
// belongs to a result page and what the last page number is.
const itemsPerPage = 20
// search returns a sorted []Page where each page contains an extract
// of the actual Page.Body in its Page.Html. Page size is 20. The
// boolean return value indicates whether there are more results.
func search(q string, page int) ([]*Page, bool) {
func search(q string, page int) ([]*Page, bool, int) {
if len(q) == 0 {
return make([]*Page, 0), false
return make([]*Page, 0), false, 0
}
index.RLock()
names := searchDocuments(q)
items := load(names)
for _, p := range items {
p.score(q)
}
slices.SortFunc(items, sortItems)
from := 20*(page-1)
slices.SortFunc(names, sortNames(q))
index.RUnlock()
from := itemsPerPage*(page-1)
if from > len(names) {
return make([]*Page, 0), false
return make([]*Page, 0), false, 0
}
to := from + 20
to := from + itemsPerPage
if to > len(names) {
to = len(names)
}
items = items[from:to]
items := load(names[from:to])
for _, p := range items {
p.score(q)
p.summarize(q)
}
return items, to < len(names)
return items, to < len(names), len(names)/itemsPerPage+1
}
// searchHandler presents a search result. It uses the query string in
@@ -104,7 +118,8 @@ func searchHandler(w http.ResponseWriter, r *http.Request) {
if err != nil {
page = 1
}
items, more := search(q, page)
s := &Search{Query: q, Items: items, Previous: page-1, Page: page, Next: page+1, Results: len(items) > 0, More: more}
items, more, last := search(q, page)
s := &Search{Query: q, Items: items, Previous: page-1, Page: page, Next: page+1, Last: last,
Results: len(items) > 0, More: more}
renderTemplate(w, "search", s)
}

View File

@@ -35,6 +35,7 @@ img { max-width: 20%; }
{{if gt .Page 1}}<a href="/search?q={{.Query}}&page={{.Previous}}">Previous</a>{{end}}
Page {{.Page}}
{{if .More}}<a href="/search?q={{.Query}}&page={{.Next}}">Next</a>{{end}}
{{if lt .Next .Last}}<a href="/search?q={{.Query}}&page={{.Last}}">Last</a>{{end}}
{{range .Items}}
<article lang="{{.Language}}">
<p><a class="result" href="/view/{{.Name}}">{{.Title}}</a>