Add timing log messages for index and search

2023-09-16 14:58:27 +02:00
11 changed files with 69 additions and 143 deletions
--- a/README.md
+++ b/README.md
@@ -119,13 +119,7 @@ is a byte array and that's why we need to call `printf`).

 For the `search.html` template only:

-`{{.Previous}}`, `{{.Page}}`, `{{.Next}}` and `{{.Last}}` are the
-previous, current, next and last page number in the results since
-doing arithmetics in templates is hard. The first page number is 1.
-
-`{{.More}}` indicates if there are any more search results.
-
-`{{.Results}}` indicates if there were any search results at all.
+`{{.Results}}` indicates if there were any search results.

 `{{.Items}}` is an array of pages, each containing a search result. A
 search result is a page (with the properties seen above). Thus, to
@@ -439,22 +433,6 @@ and "rail", a search for "mail" returns a match because the trigrams
 "mai" and "ail" are found. In this situation, the result has a score
 of 0.

-The sorting of all the pages, however, does not depend on scoring!
-Computing the score is expensive because the page must be loaded from
-disk. Therefore, results are sorted by title:
-
- If the page title contains the query string, it gets sorted first.
- If the page title begins with a number, it is sorted descending.
- All other pages follow, sorted ascending.
-
-The effect is that first, the pages with matches in the page title are
-shown, and then all the others. Within these two groups, the most
-recent blog posts are shown first, if and only if the page title
-begins with an ISO date like 2023-09-16.
-
-The score and highlighting of snippets is used to help visitors decide
-which links to click.
-
 ## Limitations

 Page titles are filenames with `.md` appended. If your filesystem
--- a/commands.go
+++ b/commands.go
@@ -17,14 +17,11 @@ func commands() {
 	} else if len(os.Args) > 2 && os.Args[1] == "search" {
 		index.load()
 		for _, q := range os.Args[2:] {
-			items, more, _ := search(q, 1)
+			items := search(q)
 			fmt.Printf("Search %s: %d results\n", q, len(items))
 			for _, p := range items {
 				fmt.Printf("* %s (%d)\n", p.Title, p.Score)
 			}
-			if more {
-				fmt.Printf("There are more results\n")
-			}
 		}
 	} else {
 		fmt.Printf("Unknown command: %v\n", os.Args[1:])
--- a/concurrency_test.go
+++ b/concurrency_test.go
@@ -10,6 +10,6 @@ func TestLoadAndSearch(t *testing.T) {
 	index.reset()
 	go index.load()
 	q := "Oddµ"
-	pages, _, _ := search(q, 1)
+	pages := search(q)
 	assert.Zero(t, len(pages))
 }
--- a/index.go
+++ b/index.go
@@ -21,9 +21,6 @@ type Index struct {
 	// documents is a map, mapping document ids of the index to
 	// page names.
 	documents map[trigram.DocID]string
-
-	// names is a map, mapping page names to titles.
-	titles map[string]string
 }

 // idx is the global Index per wiki.
@@ -33,7 +30,6 @@ var index Index
 func (idx *Index) reset() {
 	idx.index = nil
 	idx.documents = nil
-	idx.titles = nil
 }

 // add reads a file and adds it to the index. This must happen while
@@ -51,10 +47,8 @@ func (idx *Index) add(path string, info fs.FileInfo, err error) error {
 	if err != nil {
 		return err
 	}
-	p.handleTitle(false)
 	id := idx.index.Add(strings.ToLower(string(p.Body)))
 	idx.documents[id] = p.Name
-	idx.titles[p.Name] = p.Title
 	return nil
 }

@@ -65,7 +59,6 @@ func (idx *Index) load() (int, error) {
 	defer idx.Unlock()
 	idx.index = make(trigram.Index)
 	idx.documents = make(map[trigram.DocID]string)
-	idx.titles = make(map[string]string)
 	err := filepath.Walk(".", idx.add)
 	if err != nil {
 		idx.reset()
@@ -97,27 +90,23 @@ func (p *Page) updateIndex() {
 		o, err := loadPage(p.Name)
 		if err == nil {
 			index.index.Delete(strings.ToLower(string(o.Body)), id)
-			o.handleTitle(false)
-			delete(index.titles, o.Title)
 		}
 		index.index.Insert(strings.ToLower(string(p.Body)), id)
-		p.handleTitle(false)
-		index.titles[p.Name] = p.Title
 	}
 }

-// searchDocuments searches the index for a string. This requires the
-// index to be locked.
 func searchDocuments(q string) []string {
 	words := strings.Fields(strings.ToLower(q))
 	var trigrams []trigram.T
 	for _, word := range words {
 		trigrams = trigram.Extract(word, trigrams)
 	}
+	index.RLock()
 	ids := index.index.QueryTrigrams(trigrams)
 	names := make([]string, len(ids))
 	for i, id := range ids {
 		names[i] = index.documents[id]
 	}
+	index.RUnlock()
 	return names
 }
--- a/index_test.go
+++ b/index_test.go
@@ -11,19 +11,19 @@ import (
 func TestIndex(t *testing.T) {
 	index.load()
 	q := "Oddµ"
-	pages, _, _ := search(q, 1)
+	pages := search(q)
 	assert.NotZero(t, len(pages))
 	for _, p := range pages {
 		assert.NotContains(t, p.Title, "<b>")
 		assert.True(t, strings.Contains(string(p.Body), q) || strings.Contains(string(p.Title), q))
-		assert.NotZero(t, p.Score, "Score %d for %s", p.Score, p.Name)
+		assert.NotZero(t, p.Score)
 	}
 }

 func TestSearchHashtag(t *testing.T) {
 	index.load()
 	q := "#Another_Tag"
-	pages, _, _ := search(q, 1)
+	pages := search(q)
 	assert.NotZero(t, len(pages))
 }

@@ -35,7 +35,7 @@ func TestIndexUpdates(t *testing.T) {
 	p.save()

 	// Find the phrase
-	pages, _, _ := search("This is a test", 1)
+	pages := search("This is a test")
 	found := false
 	for _, p := range pages {
 		if p.Name == name {
@@ -46,7 +46,7 @@ func TestIndexUpdates(t *testing.T) {
 	assert.True(t, found)

 	// Find the phrase, case insensitive
-	pages, _, _ = search("this is a test", 1)
+	pages = search("this is a test")
 	found = false
 	for _, p := range pages {
 		if p.Name == name {
@@ -57,7 +57,7 @@ func TestIndexUpdates(t *testing.T) {
 	assert.True(t, found)

 	// Find some words
-	pages, _, _ = search("this test", 1)
+	pages = search("this test")
 	found = false
 	for _, p := range pages {
 		if p.Name == name {
@@ -70,7 +70,7 @@ func TestIndexUpdates(t *testing.T) {
 	// Update the page and no longer find it with the old phrase
 	p = &Page{Name: name, Body: []byte("Guvf vf n grfg.")}
 	p.save()
-	pages, _, _ = search("This is a test", 1)
+	pages = search("This is a test")
 	found = false
 	for _, p := range pages {
 		if p.Name == name {
@@ -81,7 +81,7 @@ func TestIndexUpdates(t *testing.T) {
 	assert.False(t, found)

 	// Find page using a new word
-	pages, _, _ = search("Guvf", 1)
+	pages = search("Guvf")
 	found = false
 	for _, p := range pages {
 		if p.Name == name {
--- a/page.go
+++ b/page.go
@@ -186,14 +186,10 @@ func (p *Page) plainText() string {
 	return string(text)
 }

-// score sets Page.Title and computes Page.Score.
-func (p *Page) score(q string) {
+// summarize for query string q sets Page.Title, computes Page.Score, and sets Page.Html to an extract and Page.Language.
+func (p *Page) summarize(q string) {
 	p.handleTitle(true)
 	p.Score = score(q, string(p.Body)) + score(q, p.Title)
-}
-
-// summarize sets Page.Html to an extract and sets Page.Language.
-func (p *Page) summarize(q string) {
 	t := p.plainText()
 	p.Html = sanitize(snippets(q, t))
 	p.Language = language(t)
--- a/score_test.go
+++ b/score_test.go
@@ -94,7 +94,7 @@ func TestScorePageAndMarkup(t *testing.T) {
 	s := `The Transjovian Council accepts new members. If you think we'd be a good fit, apply for an account. Contact [Alex Schroeder](https://alexschroeder.ch/wiki/Contact). Mail is best. Encrypted mail is best. [Delta Chat](https://delta.chat/de/) is a messenger app that uses encrypted mail. It's the bestest best.`
 	p := &Page{Title: "Test", Name: "Test", Body: []byte(s)}
 	q := "wiki"
-	p.score(q)
+	p.summarize(q)
 	// "wiki" is not visible in the plain text but the score is no affected:
 	// - wiki, all, whole, beginning, end (5)
 	if p.Score != 5 {
--- a/search.go
+++ b/search.go
@@ -4,8 +4,7 @@ import (
 	"fmt"
 	"net/http"
 	"slices"
-	"strconv"
-	"strings"
+	"time"
 	"unicode"
 	"unicode/utf8"
 )
@@ -16,97 +15,73 @@ import (
 // a search result, Body and Html are simple extracts.
 type Search struct {
 	Query   string
-	Items   []*Page
-	Previous int
-	Page    int
-	Next    int
-	Last    int
-	More    bool
+	Items   []Page
 	Results bool
 }

-// sortNames returns a sort function that sorts in three stages: 1.
-// whether the query string matches the page title; 2. descending if
-// the page titles start with a digit; 3. otherwise ascending.
-// Access to the index requires a read lock!
-func sortNames(q string) func (a, b string) int {
-	return func (a, b string) int {
-		// If only one page contains the query string, it
-		// takes precedence.
-		ia := strings.Contains(index.titles[a], q)
-		ib := strings.Contains(index.titles[b], q)
-		if (ia && !ib) {
-			return -1
-		} else if (!ia && ib) {
+func sortItems(a, b Page) int {
+	// Sort by score
+	if a.Score < b.Score {
+		return 1
+	} else if a.Score > b.Score {
+		return -1
+	}
+	// If the score is the same and both page titles start
+	// with a number (like an ISO date), sort descending.
+	ra, _ := utf8.DecodeRuneInString(a.Title)
+	rb, _ := utf8.DecodeRuneInString(b.Title)
+	if unicode.IsNumber(ra) && unicode.IsNumber(rb) {
+		if a.Title < b.Title {
 			return 1
-		}
-		// If both page names start with a number (like an ISO date),
-		// sort descending.
-		ra, _ := utf8.DecodeRuneInString(a)
-		rb, _ := utf8.DecodeRuneInString(b)
-		if unicode.IsNumber(ra) && unicode.IsNumber(rb) {
-			if a < b {
-				return 1
-			} else if a > b {
-				return -1
-			} else {
-				return 0
-			}
-		}
-		// Otherwise sort ascending.
-		if a < b {
+		} else if a.Title > b.Title {
 			return -1
-		} else if a > b {
-			return 1
 		} else {
 			return 0
 		}
 	}
+	// Otherwise sort ascending.
+	if a.Title < b.Title {
+		return -1
+	} else if a.Title > b.Title {
+		return 1
+	} else {
+		return 0
+	}
 }

-// load the pages named.
-func load(names []string) []*Page {
-	items := make([]*Page, len(names))
+// loadAndSummarize loads the pages named and summarizes them for the
+// query given.
+func loadAndSummarize(names []string, q string) []Page {
+	// Load and summarize the items.
+	items := make([]Page, len(names))
 	for i, name := range names {
 		p, err := loadPage(name)
 		if err != nil {
 			fmt.Printf("Error loading %s\n", name)
 		} else {
-			items[i] = p
+			p.summarize(q)
+			items[i] = *p
 		}
 	}
 	return items
 }

-// itemsPerPage says how many items to print on a page of search
-// results.
-const itemsPerPage = 20
-
 // search returns a sorted []Page where each page contains an extract
-// of the actual Page.Body in its Page.Html. Page size is 20. The
-// boolean return value indicates whether there are more results.
-func search(q string, page int) ([]*Page, bool, int) {
+// of the actual Page.Body in its Page.Html.
+func search(q string) []Page {
 	if len(q) == 0 {
-		return make([]*Page, 0), false, 0
+		return make([]Page, 0)
 	}
-	index.RLock()
+	start := time.Now()
 	names := searchDocuments(q)
-	slices.SortFunc(names, sortNames(q))
-	index.RUnlock()
-	from := itemsPerPage*(page-1)
-	if from > len(names) {
-		return make([]*Page, 0), false, 0
-	}
-	to := from + itemsPerPage
-	if to > len(names) {
-		to = len(names)
-	}
-	items := load(names[from:to])
-	for _, p := range items {
-		p.score(q)
-		p.summarize(q)
-	}
-	return items, to < len(names), len(names)/itemsPerPage+1
+	fmt.Printf("Search for %v found %d pages in %v\n", q, len(names), time.Since(start))
+	start = time.Now()
+	items := loadAndSummarize(names, q)
+	fmt.Printf("Loading and summarizing %d pages took %v\n", len(names), time.Since(start))
+	start = time.Now()
+	slices.SortFunc(items, sortItems)
+	fmt.Printf("Sorting %d pages took %v\n", len(names), time.Since(start))
+	return items
 }

 // searchHandler presents a search result. It uses the query string in
@@ -114,12 +89,7 @@ func search(q string, page int) ([]*Page, bool, int) {
 // page found, the HTML is just an extract of the actual body.
 func searchHandler(w http.ResponseWriter, r *http.Request) {
 	q := r.FormValue("q")
-	page, err := strconv.Atoi(r.FormValue("page"))
-	if err != nil {
-		page = 1
-	}
-	items, more, last := search(q, page)
-	s := &Search{Query: q, Items: items, Previous: page-1, Page: page, Next: page+1, Last: last,
-		Results: len(items) > 0, More: more}
+	items := search(q)
+	s := &Search{Query: q, Items: items, Results: len(items) > 0}
 	renderTemplate(w, "search", s)
 }
--- a/search.html
+++ b/search.html
@@ -23,19 +23,13 @@ img { max-width: 20%; }
      <a href="/view/index">Home</a>
      <form role="search" action="/search" method="GET">
        <label for="search">Search:</label>
-        <input id="search" type="text" value="{{.Query}}" spellcheck="false" name="q" accesskey="f" required>
+        <input id="search" type="text" value="{{.Query}}" spellcheck="false" name="q" required>
        <button>Go</button>
      </form>
    </header>
    <main id="main">
      <h1>Search for {{.Query}}</h1>
      {{if .Results}}
-      <p>
-        {{if gt .Page 2}}<a href="/search?q={{.Query}}&page=1">First</a>{{end}}
-        {{if gt .Page 1}}<a href="/search?q={{.Query}}&page={{.Previous}}">Previous</a>{{end}}
-        Page {{.Page}}
-        {{if .More}}<a href="/search?q={{.Query}}&page={{.Next}}">Next</a>{{end}}
-        {{if lt .Next .Last}}<a href="/search?q={{.Query}}&page={{.Last}}">Last</a>{{end}}
      {{range .Items}}
      <article lang="{{.Language}}">
        <p><a class="result" href="/view/{{.Name}}">{{.Title}}</a>
--- a/view.html
+++ b/view.html
@@ -20,12 +20,12 @@ img { max-width: 100%; }
    <header>
      <a href="#main">Skip navigation</a>
      <a href="/view/index">Home</a>
-      <a href="/edit/{{.Name}}" accesskey="e">Edit</a>
-      <a href="/add/{{.Name}}" accesskey="a">Add</a>
-      <a href="/upload/{{.Dir}}" accesskey="u">Upload</a>
+      <a href="/edit/{{.Name}}">Edit</a>
+      <a href="/add/{{.Name}}">Add</a>
+      <a href="/upload/{{.Dir}}">Upload</a>
      <form role="search" action="/search" method="GET">
        <label for="search">Search:</label>
-        <input id="search" type="text" spellcheck="false" name="q" accesskey="f" required>
+        <input id="search" type="text" spellcheck="false" name="q" required>
        <button>Go</button>
      </form>
    </header>
--- a/wiki.go
+++ b/wiki.go
@@ -6,6 +6,7 @@ import (
 	"net/http"
 	"os"
 	"regexp"
+	"time"
 )

 // Templates are parsed at startup.
@@ -67,9 +68,10 @@ func getPort() string {
 // messages.
 func scheduleLoadIndex() {
 	fmt.Print("Indexing pages\n")
+	start := time.Now()
 	n, err := index.load()
 	if err == nil {
-		fmt.Printf("Indexed %d pages\n", n)
+		fmt.Printf("Indexed %d pages in %v\n", n, time.Since(start))
 	} else {
 		fmt.Println("Indexing failed")
 	}