Search results are based on page titles only

2023-09-16 23:36:17 +02:00
7 changed files with 95 additions and 54 deletions
--- a/README.md
+++ b/README.md
@@ -119,15 +119,13 @@ is a byte array and that's why we need to call `printf`).

 For the `search.html` template only:

-`{{.Page}}` is the page number in the results.
-
-`{{.Previous}}` and `{{.Next}} are the previous and next page number
-in the results since doing arithmetics in templates is hard. The first
-page number is 1.
+`{{.Previous}}`, `{{.Page}}`, `{{.Next}}` and `{{.Last}}` are the
+previous, current, next and last page numbers in the results, since
+doing arithmetic in templates is hard. The first page number is 1.

 `{{.More}}` indicates if there are any more search results.

-`{{.Results}}` indicates if there were any search results.
+`{{.Results}}` indicates if there were any search results at all.

 `{{.Items}}` is an array of pages, each containing a search result. A
 search result is a page (with the properties seen above). Thus, to
@@ -441,6 +439,22 @@ and "rail", a search for "mail" returns a match because the trigrams
 "mai" and "ail" are found. In this situation, the result has a score
 of 0.

+The sorting of all the pages, however, does not depend on scoring!
+Computing the score is expensive because the page must be loaded from
+disk. Therefore, results are sorted by title:
+
+- If the page title contains the query string, it gets sorted first.
+- If the page title begins with a number, it is sorted descending.
+- All other pages follow, sorted ascending.
+
+The effect is that first, the pages with matches in the page title are
+shown, and then all the others. Within these two groups, the most
+recent blog posts are shown first, if and only if the page title
+begins with an ISO date like 2023-09-16.
+
+The score and the highlighting of snippets are used to help visitors
+decide which links to click.
+
 ## Limitations

 Page titles are filenames with `.md` appended. If your filesystem
--- a/commands.go
+++ b/commands.go
@@ -17,7 +17,7 @@ func commands() {
 	} else if len(os.Args) > 2 && os.Args[1] == "search" {
 		index.load()
 		for _, q := range os.Args[2:] {
-			items, more := search(q, 1)
+			items, more, _ := search(q, 1)
 			fmt.Printf("Search %s: %d results\n", q, len(items))
 			for _, p := range items {
 				fmt.Printf("* %s (%d)\n", p.Title, p.Score)
--- a/concurrency_test.go
+++ b/concurrency_test.go
@@ -10,6 +10,6 @@ func TestLoadAndSearch(t *testing.T) {
 	index.reset()
 	go index.load()
 	q := "Oddµ"
-	pages, _ := search(q, 1)
+	pages, _, _ := search(q, 1)
 	assert.Zero(t, len(pages))
 }
--- a/index.go
+++ b/index.go
@@ -21,6 +21,9 @@ type Index struct {
 	// documents is a map, mapping document ids of the index to
 	// page names.
 	documents map[trigram.DocID]string
+
+	// titles is a map, mapping page names to titles.
+	titles map[string]string
 }

 // idx is the global Index per wiki.
@@ -30,6 +33,7 @@ var index Index
 func (idx *Index) reset() {
 	idx.index = nil
 	idx.documents = nil
+	idx.titles = nil
 }

 // add reads a file and adds it to the index. This must happen while
@@ -47,8 +51,10 @@ func (idx *Index) add(path string, info fs.FileInfo, err error) error {
 	if err != nil {
 		return err
 	}
+	p.handleTitle(false)
 	id := idx.index.Add(strings.ToLower(string(p.Body)))
 	idx.documents[id] = p.Name
+	idx.titles[p.Name] = p.Title
 	return nil
 }

@@ -59,6 +65,7 @@ func (idx *Index) load() (int, error) {
 	defer idx.Unlock()
 	idx.index = make(trigram.Index)
 	idx.documents = make(map[trigram.DocID]string)
+	idx.titles = make(map[string]string)
 	err := filepath.Walk(".", idx.add)
 	if err != nil {
 		idx.reset()
@@ -90,23 +97,27 @@ func (p *Page) updateIndex() {
 		o, err := loadPage(p.Name)
 		if err == nil {
 			index.index.Delete(strings.ToLower(string(o.Body)), id)
+			o.handleTitle(false)
+			delete(index.titles, o.Title)
 		}
 		index.index.Insert(strings.ToLower(string(p.Body)), id)
+		p.handleTitle(false)
+		index.titles[p.Name] = p.Title
 	}
 }

+// searchDocuments searches the index for a string. This requires the
+// index to be locked.
 func searchDocuments(q string) []string {
 	words := strings.Fields(strings.ToLower(q))
 	var trigrams []trigram.T
 	for _, word := range words {
 		trigrams = trigram.Extract(word, trigrams)
 	}
-	index.RLock()
 	ids := index.index.QueryTrigrams(trigrams)
 	names := make([]string, len(ids))
 	for i, id := range ids {
 		names[i] = index.documents[id]
 	}
-	index.RUnlock()
 	return names
 }
--- a/index_test.go
+++ b/index_test.go
@@ -11,7 +11,7 @@ import (
 func TestIndex(t *testing.T) {
 	index.load()
 	q := "Oddµ"
-	pages, _ := search(q, 1)
+	pages, _, _ := search(q, 1)
 	assert.NotZero(t, len(pages))
 	for _, p := range pages {
 		assert.NotContains(t, p.Title, "<b>")
@@ -23,7 +23,7 @@ func TestIndex(t *testing.T) {
 func TestSearchHashtag(t *testing.T) {
 	index.load()
 	q := "#Another_Tag"
-	pages, _ := search(q, 1)
+	pages, _, _ := search(q, 1)
 	assert.NotZero(t, len(pages))
 }

@@ -35,7 +35,7 @@ func TestIndexUpdates(t *testing.T) {
 	p.save()

 	// Find the phrase
-	pages, _ := search("This is a test", 1)
+	pages, _, _ := search("This is a test", 1)
 	found := false
 	for _, p := range pages {
 		if p.Name == name {
@@ -46,7 +46,7 @@ func TestIndexUpdates(t *testing.T) {
 	assert.True(t, found)

 	// Find the phrase, case insensitive
-	pages, _ = search("this is a test", 1)
+	pages, _, _ = search("this is a test", 1)
 	found = false
 	for _, p := range pages {
 		if p.Name == name {
@@ -57,7 +57,7 @@ func TestIndexUpdates(t *testing.T) {
 	assert.True(t, found)

 	// Find some words
-	pages, _ = search("this test", 1)
+	pages, _, _ = search("this test", 1)
 	found = false
 	for _, p := range pages {
 		if p.Name == name {
@@ -70,7 +70,7 @@ func TestIndexUpdates(t *testing.T) {
 	// Update the page and no longer find it with the old phrase
 	p = &Page{Name: name, Body: []byte("Guvf vf n grfg.")}
 	p.save()
-	pages, _ = search("This is a test", 1)
+	pages, _, _ = search("This is a test", 1)
 	found = false
 	for _, p := range pages {
 		if p.Name == name {
@@ -81,7 +81,7 @@ func TestIndexUpdates(t *testing.T) {
 	assert.False(t, found)

 	// Find page using a new word
-	pages, _ = search("Guvf", 1)
+	pages, _, _ = search("Guvf", 1)
 	found = false
 	for _, p := range pages {
 		if p.Name == name {
--- a/search.go
+++ b/search.go
@@ -5,6 +5,7 @@ import (
 	"net/http"
 	"slices"
 	"strconv"
+	"strings"
 	"unicode"
 	"unicode/utf8"
 )
@@ -19,38 +20,48 @@ type Search struct {
 	Previous int
 	Page    int
 	Next    int
+	Last    int
 	More    bool
 	Results bool
 }

-func sortItems(a, b *Page) int {
-	// Sort by score
-	if a.Score < b.Score {
-		return 1
-	} else if a.Score > b.Score {
-		return -1
-	}
-	// If the score is the same and both page names start
-	// with a number (like an ISO date), sort descending.
-	ra, _ := utf8.DecodeRuneInString(a.Title)
-	rb, _ := utf8.DecodeRuneInString(b.Title)
-	if unicode.IsNumber(ra) && unicode.IsNumber(rb) {
-		if a.Title < b.Title {
-			return 1
-		} else if a.Title > b.Title {
+// sortNames returns a sort function that sorts in three stages: 1.
+// whether the query string is contained in the page title; 2.
+// descending if both page names start with a number; 3. otherwise
+// ascending. Access to the index requires a read lock!
+func sortNames(q string) func (a, b string) int {
+	return func (a, b string) int {
+		// If only one page contains the query string, it
+		// takes precedence.
+		ia := strings.Contains(index.titles[a], q)
+		ib := strings.Contains(index.titles[b], q)
+		if (ia && !ib) {
 			return -1
+		} else if (!ia && ib) {
+			return 1
+		}
+		// If both page names start with a number (like an ISO date),
+		// sort descending.
+		ra, _ := utf8.DecodeRuneInString(a)
+		rb, _ := utf8.DecodeRuneInString(b)
+		if unicode.IsNumber(ra) && unicode.IsNumber(rb) {
+			if a < b {
+				return 1
+			} else if a > b {
+				return -1
+			} else {
+				return 0
+			}
+		}
+		// Otherwise sort ascending.
+		if a < b {
+			return -1
+		} else if a > b {
+			return 1
 		} else {
 			return 0
 		}
 	}
-	// Otherwise sort ascending.
-	if a.Title < b.Title {
-		return -1
-	} else if a.Title > b.Title {
-		return 1
-	} else {
-		return 0
-	}
 }

 // load the pages named.
@@ -67,32 +78,35 @@ func load(names []string) []*Page {
 	return items
 }

+// itemsPerPage says how many items to print on a page of search
+// results.
+const itemsPerPage = 20
+
 // search returns a sorted []Page where each page contains an extract
 // of the actual Page.Body in its Page.Html. Page size is 20. The
 // boolean return value indicates whether there are more results.
-func search(q string, page int) ([]*Page, bool) {
+func search(q string, page int) ([]*Page, bool, int) {
 	if len(q) == 0 {
-		return make([]*Page, 0), false
+		return make([]*Page, 0), false, 0
 	}
+	index.RLock()
 	names := searchDocuments(q)
-	items := load(names)
-	for _, p := range items {
-		p.score(q)
-	}
-	slices.SortFunc(items, sortItems)
-	from := 20*(page-1)
+	slices.SortFunc(names, sortNames(q))
+	index.RUnlock()
+	from := itemsPerPage*(page-1)
 	if from > len(names) {
-		return make([]*Page, 0), false
+		return make([]*Page, 0), false, 0
 	}
-	to := from + 20
+	to := from + itemsPerPage
 	if to > len(names) {
 		to = len(names)
 	}
-	items = items[from:to]
+	items := load(names[from:to])
 	for _, p := range items {
+		p.score(q)
 		p.summarize(q)
 	}
-	return items, to < len(names)
+	return items, to < len(names), len(names)/itemsPerPage+1
 }

 // searchHandler presents a search result. It uses the query string in
@@ -104,7 +118,8 @@ func searchHandler(w http.ResponseWriter, r *http.Request) {
 	if err != nil {
 		page = 1
 	}
-	items, more := search(q, page)
-	s := &Search{Query: q, Items: items, Previous: page-1, Page: page, Next: page+1, Results: len(items) > 0, More: more}
+	items, more, last := search(q, page)
+	s := &Search{Query: q, Items: items, Previous: page-1, Page: page, Next: page+1, Last: last,
+		Results: len(items) > 0, More: more}
 	renderTemplate(w, "search", s)
 }
--- a/search.html
+++ b/search.html
@@ -35,6 +35,7 @@ img { max-width: 20%; }
        {{if gt .Page 1}}<a href="/search?q={{.Query}}&page={{.Previous}}">Previous</a>{{end}}
        Page {{.Page}}
        {{if .More}}<a href="/search?q={{.Query}}&page={{.Next}}">Next</a>{{end}}
+        {{if lt .Next .Last}}<a href="/search?q={{.Query}}&page={{.Last}}">Last</a>{{end}}
      {{range .Items}}
      <article lang="{{.Language}}">
        <p><a class="result" href="/view/{{.Name}}">{{.Title}}</a>