1 Commits
v1.0 ... timing

Author SHA1 Message Date
Alex Schroeder
b8429f58ef Add timing log messages for index and search 2023-09-16 14:58:27 +02:00
11 changed files with 69 additions and 143 deletions

View File

@@ -119,13 +119,7 @@ is a byte array and that's why we need to call `printf`).
For the `search.html` template only:
`{{.Previous}}`, `{{.Page}}`, `{{.Next}}` and `{{.Last}}` are the
previous, current, next and last page number in the results since
doing arithmetic in templates is hard. The first page number is 1.
`{{.More}}` indicates if there are any more search results.
`{{.Results}}` indicates if there were any search results at all.
`{{.Results}}` indicates if there were any search results.
`{{.Items}}` is an array of pages, each containing a search result. A
search result is a page (with the properties seen above). Thus, to
@@ -439,22 +433,6 @@ and "rail", a search for "mail" returns a match because the trigrams
"mai" and "ail" are found. In this situation, the result has a score
of 0.
The sorting of all the pages, however, does not depend on scoring!
Computing the score is expensive because the page must be loaded from
disk. Therefore, results are sorted by title:
- If the page title contains the query string, it gets sorted first.
- If the page title begins with a number, it is sorted descending.
- All other pages follow, sorted ascending.
The effect is that first, the pages with matches in the page title are
shown, and then all the others. Within these two groups, the most
recent blog posts are shown first, if and only if the page title
begins with an ISO date like 2023-09-16.
The score and highlighting of snippets are used to help visitors decide
which links to click.
## Limitations
Page titles are filenames with `.md` appended. If your filesystem

View File

@@ -17,14 +17,11 @@ func commands() {
} else if len(os.Args) > 2 && os.Args[1] == "search" {
index.load()
for _, q := range os.Args[2:] {
items, more, _ := search(q, 1)
items := search(q)
fmt.Printf("Search %s: %d results\n", q, len(items))
for _, p := range items {
fmt.Printf("* %s (%d)\n", p.Title, p.Score)
}
if more {
fmt.Printf("There are more results\n")
}
}
} else {
fmt.Printf("Unknown command: %v\n", os.Args[1:])

View File

@@ -10,6 +10,6 @@ func TestLoadAndSearch(t *testing.T) {
index.reset()
go index.load()
q := "Oddµ"
pages, _, _ := search(q, 1)
pages := search(q)
assert.Zero(t, len(pages))
}

View File

@@ -21,9 +21,6 @@ type Index struct {
// documents is a map, mapping document ids of the index to
// page names.
documents map[trigram.DocID]string
// titles is a map, mapping page names to titles.
titles map[string]string
}
// index is the global Index per wiki.
@@ -33,7 +30,6 @@ var index Index
func (idx *Index) reset() {
idx.index = nil
idx.documents = nil
idx.titles = nil
}
// add reads a file and adds it to the index. This must happen while
@@ -51,10 +47,8 @@ func (idx *Index) add(path string, info fs.FileInfo, err error) error {
if err != nil {
return err
}
p.handleTitle(false)
id := idx.index.Add(strings.ToLower(string(p.Body)))
idx.documents[id] = p.Name
idx.titles[p.Name] = p.Title
return nil
}
@@ -65,7 +59,6 @@ func (idx *Index) load() (int, error) {
defer idx.Unlock()
idx.index = make(trigram.Index)
idx.documents = make(map[trigram.DocID]string)
idx.titles = make(map[string]string)
err := filepath.Walk(".", idx.add)
if err != nil {
idx.reset()
@@ -97,27 +90,23 @@ func (p *Page) updateIndex() {
o, err := loadPage(p.Name)
if err == nil {
index.index.Delete(strings.ToLower(string(o.Body)), id)
o.handleTitle(false)
delete(index.titles, o.Title)
}
index.index.Insert(strings.ToLower(string(p.Body)), id)
p.handleTitle(false)
index.titles[p.Name] = p.Title
}
}
// searchDocuments looks up the query string q in the trigram index and
// returns the names of the pages the index reports for it. The query
// is lower-cased and split on whitespace, and the trigrams of every
// word are collected into a single query; lower-casing mirrors how
// page bodies are lower-cased before they are added to the index.
//
// The function acquires the read lock on the index itself, so callers
// must NOT already hold it: nested read locks can deadlock as soon as
// a writer is waiting (assuming Index embeds a sync.RWMutex, which
// prohibits recursive read locking).
func searchDocuments(q string) []string {
	words := strings.Fields(strings.ToLower(q))
	var trigrams []trigram.T
	for _, word := range words {
		trigrams = trigram.Extract(word, trigrams)
	}
	index.RLock()
	// Deferring the unlock releases the lock even if the lookup below
	// panics, so a failed search cannot block later index updates.
	defer index.RUnlock()
	ids := index.index.QueryTrigrams(trigrams)
	// Translate the document ids of the index back into page names.
	names := make([]string, len(ids))
	for i, id := range ids {
		names[i] = index.documents[id]
	}
	return names
}

View File

@@ -11,19 +11,19 @@ import (
func TestIndex(t *testing.T) {
index.load()
q := "Oddµ"
pages, _, _ := search(q, 1)
pages := search(q)
assert.NotZero(t, len(pages))
for _, p := range pages {
assert.NotContains(t, p.Title, "<b>")
assert.True(t, strings.Contains(string(p.Body), q) || strings.Contains(string(p.Title), q))
assert.NotZero(t, p.Score, "Score %d for %s", p.Score, p.Name)
assert.NotZero(t, p.Score)
}
}
func TestSearchHashtag(t *testing.T) {
index.load()
q := "#Another_Tag"
pages, _, _ := search(q, 1)
pages := search(q)
assert.NotZero(t, len(pages))
}
@@ -35,7 +35,7 @@ func TestIndexUpdates(t *testing.T) {
p.save()
// Find the phrase
pages, _, _ := search("This is a test", 1)
pages := search("This is a test")
found := false
for _, p := range pages {
if p.Name == name {
@@ -46,7 +46,7 @@ func TestIndexUpdates(t *testing.T) {
assert.True(t, found)
// Find the phrase, case insensitive
pages, _, _ = search("this is a test", 1)
pages = search("this is a test")
found = false
for _, p := range pages {
if p.Name == name {
@@ -57,7 +57,7 @@ func TestIndexUpdates(t *testing.T) {
assert.True(t, found)
// Find some words
pages, _, _ = search("this test", 1)
pages = search("this test")
found = false
for _, p := range pages {
if p.Name == name {
@@ -70,7 +70,7 @@ func TestIndexUpdates(t *testing.T) {
// Update the page and no longer find it with the old phrase
p = &Page{Name: name, Body: []byte("Guvf vf n grfg.")}
p.save()
pages, _, _ = search("This is a test", 1)
pages = search("This is a test")
found = false
for _, p := range pages {
if p.Name == name {
@@ -81,7 +81,7 @@ func TestIndexUpdates(t *testing.T) {
assert.False(t, found)
// Find page using a new word
pages, _, _ = search("Guvf", 1)
pages = search("Guvf")
found = false
for _, p := range pages {
if p.Name == name {

View File

@@ -186,14 +186,10 @@ func (p *Page) plainText() string {
return string(text)
}
// score sets Page.Title and computes Page.Score.
func (p *Page) score(q string) {
// summarize for query string q sets Page.Html to an extract.
func (p *Page) summarize(q string) {
p.handleTitle(true)
p.Score = score(q, string(p.Body)) + score(q, p.Title)
}
// summarize sets Page.Html to an extract and sets Page.Language.
func (p *Page) summarize(q string) {
t := p.plainText()
p.Html = sanitize(snippets(q, t))
p.Language = language(t)

View File

@@ -94,7 +94,7 @@ func TestScorePageAndMarkup(t *testing.T) {
s := `The Transjovian Council accepts new members. If you think we'd be a good fit, apply for an account. Contact [Alex Schroeder](https://alexschroeder.ch/wiki/Contact). Mail is best. Encrypted mail is best. [Delta Chat](https://delta.chat/de/) is a messenger app that uses encrypted mail. It's the bestest best.`
p := &Page{Title: "Test", Name: "Test", Body: []byte(s)}
q := "wiki"
p.score(q)
p.summarize(q)
// "wiki" is not visible in the plain text but the score is not affected:
// - wiki, all, whole, beginning, end (5)
if p.Score != 5 {

120
search.go
View File

@@ -4,8 +4,7 @@ import (
"fmt"
"net/http"
"slices"
"strconv"
"strings"
"time"
"unicode"
"unicode/utf8"
)
@@ -16,97 +15,73 @@ import (
// a search result, Body and Html are simple extracts.
type Search struct {
Query string
Items []*Page
Previous int
Page int
Next int
Last int
More bool
Items []Page
Results bool
}
// sortNames returns a sort function that sorts in three stages: 1.
// whether the query string matches the page title; 2. descending if
// the page titles start with a digit; 3. otherwise ascending.
// Access to the index requires a read lock!
func sortNames(q string) func (a, b string) int {
return func (a, b string) int {
// If only one page contains the query string, it
// takes precedence.
ia := strings.Contains(index.titles[a], q)
ib := strings.Contains(index.titles[b], q)
if (ia && !ib) {
return -1
} else if (!ia && ib) {
func sortItems(a, b Page) int {
// Sort by score
if a.Score < b.Score {
return 1
} else if a.Score > b.Score {
return -1
}
// If the score is the same and both page names start
// with a number (like an ISO date), sort descending.
ra, _ := utf8.DecodeRuneInString(a.Title)
rb, _ := utf8.DecodeRuneInString(b.Title)
if unicode.IsNumber(ra) && unicode.IsNumber(rb) {
if a.Title < b.Title {
return 1
}
// If both page names start with a number (like an ISO date),
// sort descending.
ra, _ := utf8.DecodeRuneInString(a)
rb, _ := utf8.DecodeRuneInString(b)
if unicode.IsNumber(ra) && unicode.IsNumber(rb) {
if a < b {
return 1
} else if a > b {
return -1
} else {
return 0
}
}
// Otherwise sort ascending.
if a < b {
} else if a.Title > b.Title {
return -1
} else if a > b {
return 1
} else {
return 0
}
}
// Otherwise sort ascending.
if a.Title < b.Title {
return -1
} else if a.Title > b.Title {
return 1
} else {
return 0
}
}
// load the pages named.
func load(names []string) []*Page {
items := make([]*Page, len(names))
// loadAndSummarize loads the pages named and summarizes them for the
// query given.
func loadAndSummarize(names []string, q string) []Page {
// Load and summarize the items.
items := make([]Page, len(names))
for i, name := range names {
p, err := loadPage(name)
if err != nil {
fmt.Printf("Error loading %s\n", name)
} else {
items[i] = p
p.summarize(q)
items[i] = *p
}
}
return items
}
// itemsPerPage says how many items to print on a page of search
// results.
const itemsPerPage = 20
// search returns a sorted []Page where each page contains an extract
// of the actual Page.Body in its Page.Html. Page size is 20. The
// boolean return value indicates whether there are more results.
func search(q string, page int) ([]*Page, bool, int) {
// of the actual Page.Body in its Page.Html.
func search(q string) []Page {
if len(q) == 0 {
return make([]*Page, 0), false, 0
return make([]Page, 0)
}
index.RLock()
start := time.Now()
names := searchDocuments(q)
slices.SortFunc(names, sortNames(q))
index.RUnlock()
from := itemsPerPage*(page-1)
if from > len(names) {
return make([]*Page, 0), false, 0
}
to := from + itemsPerPage
if to > len(names) {
to = len(names)
}
items := load(names[from:to])
for _, p := range items {
p.score(q)
p.summarize(q)
}
return items, to < len(names), len(names)/itemsPerPage+1
fmt.Printf("Search for %v found %d pages in %v\n", q, len(names), time.Since(start))
start = time.Now()
items := loadAndSummarize(names, q)
fmt.Printf("Loading and summarizing %d pages took %v\n", len(names), time.Since(start))
start = time.Now()
slices.SortFunc(items, sortItems)
fmt.Printf("Sorting %d pages took %v\n", len(names), time.Since(start))
return items
}
// searchHandler presents a search result. It uses the query string in
@@ -114,12 +89,7 @@ func search(q string, page int) ([]*Page, bool, int) {
// page found, the HTML is just an extract of the actual body.
func searchHandler(w http.ResponseWriter, r *http.Request) {
q := r.FormValue("q")
page, err := strconv.Atoi(r.FormValue("page"))
if err != nil {
page = 1
}
items, more, last := search(q, page)
s := &Search{Query: q, Items: items, Previous: page-1, Page: page, Next: page+1, Last: last,
Results: len(items) > 0, More: more}
items := search(q)
s := &Search{Query: q, Items: items, Results: len(items) > 0}
renderTemplate(w, "search", s)
}

View File

@@ -23,19 +23,13 @@ img { max-width: 20%; }
<a href="/view/index">Home</a>
<form role="search" action="/search" method="GET">
<label for="search">Search:</label>
<input id="search" type="text" value="{{.Query}}" spellcheck="false" name="q" accesskey="f" required>
<input id="search" type="text" value="{{.Query}}" spellcheck="false" name="q" required>
<button>Go</button>
</form>
</header>
<main id="main">
<h1>Search for {{.Query}}</h1>
{{if .Results}}
<p>
{{if gt .Page 2}}<a href="/search?q={{.Query}}&page=1">First</a>{{end}}
{{if gt .Page 1}}<a href="/search?q={{.Query}}&page={{.Previous}}">Previous</a>{{end}}
Page {{.Page}}
{{if .More}}<a href="/search?q={{.Query}}&page={{.Next}}">Next</a>{{end}}
{{if lt .Next .Last}}<a href="/search?q={{.Query}}&page={{.Last}}">Last</a>{{end}}
{{range .Items}}
<article lang="{{.Language}}">
<p><a class="result" href="/view/{{.Name}}">{{.Title}}</a>

View File

@@ -20,12 +20,12 @@ img { max-width: 100%; }
<header>
<a href="#main">Skip navigation</a>
<a href="/view/index">Home</a>
<a href="/edit/{{.Name}}" accesskey="e">Edit</a>
<a href="/add/{{.Name}}" accesskey="a">Add</a>
<a href="/upload/{{.Dir}}" accesskey="u">Upload</a>
<a href="/edit/{{.Name}}">Edit</a>
<a href="/add/{{.Name}}">Add</a>
<a href="/upload/{{.Dir}}">Upload</a>
<form role="search" action="/search" method="GET">
<label for="search">Search:</label>
<input id="search" type="text" spellcheck="false" name="q" accesskey="f" required>
<input id="search" type="text" spellcheck="false" name="q" required>
<button>Go</button>
</form>
</header>

View File

@@ -6,6 +6,7 @@ import (
"net/http"
"os"
"regexp"
"time"
)
// Templates are parsed at startup.
@@ -67,9 +68,10 @@ func getPort() string {
// messages.
func scheduleLoadIndex() {
fmt.Print("Indexing pages\n")
start := time.Now()
n, err := index.load()
if err == nil {
fmt.Printf("Indexed %d pages\n", n)
fmt.Printf("Indexed %d pages in %v\n", n, time.Since(start))
} else {
fmt.Println("Indexing failed")
}