forked from mirror/oddmu
Compare commits
1 Commits
v0.9
...
no-scoring
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
153a179d92 |
26
README.md
26
README.md
@@ -119,15 +119,13 @@ is a byte array and that's why we need to call `printf`).
|
||||
|
||||
For the `search.html` template only:
|
||||
|
||||
`{{.Page}}` is the page number in the results.
|
||||
|
||||
`{{.Previous}}` and `{{.Next}} are the previous and next page number
|
||||
in the results since doing arithmetics in templates is hard. The first
|
||||
page number is 1.
|
||||
`{{.Previous}}`, `{{.Page}}`, `{{.Next}}` and `{{.Last}}` are the
|
||||
previous, current, next and last page number in the results since
|
||||
doing arithmetic in templates is hard. The first page number is 1.
|
||||
|
||||
`{{.More}}` indicates if there are any more search results.
|
||||
|
||||
`{{.Results}}` indicates if there were any search results.
|
||||
`{{.Results}}` indicates if there were any search results at all.
|
||||
|
||||
`{{.Items}}` is an array of pages, each containing a search result. A
|
||||
search result is a page (with the properties seen above). Thus, to
|
||||
@@ -441,6 +439,22 @@ and "rail", a search for "mail" returns a match because the trigrams
|
||||
"mai" and "ail" are found. In this situation, the result has a score
|
||||
of 0.
|
||||
|
||||
The sorting of all the pages, however, does not depend on scoring!
|
||||
Computing the score is expensive because the page must be loaded from
|
||||
disk. Therefore, results are sorted by title:
|
||||
|
||||
- If the page title contains the query string, it gets sorted first.
|
||||
- If the page title begins with a number, it is sorted descending.
|
||||
- All other pages follow, sorted ascending.
|
||||
|
||||
The effect is that first, the pages with matches in the page title are
|
||||
shown, and then all the others. Within these two groups, the most
|
||||
recent blog posts are shown first, if and only if the page title
|
||||
begins with an ISO date like 2023-09-16.
|
||||
|
||||
The score and highlighting of snippets are used to help visitors decide
|
||||
which links to click.
|
||||
|
||||
## Limitations
|
||||
|
||||
Page titles are filenames with `.md` appended. If your filesystem
|
||||
|
||||
@@ -17,7 +17,7 @@ func commands() {
|
||||
} else if len(os.Args) > 2 && os.Args[1] == "search" {
|
||||
index.load()
|
||||
for _, q := range os.Args[2:] {
|
||||
items, more := search(q, 1)
|
||||
items, more, _ := search(q, 1)
|
||||
fmt.Printf("Search %s: %d results\n", q, len(items))
|
||||
for _, p := range items {
|
||||
fmt.Printf("* %s (%d)\n", p.Title, p.Score)
|
||||
|
||||
@@ -10,6 +10,6 @@ func TestLoadAndSearch(t *testing.T) {
|
||||
index.reset()
|
||||
go index.load()
|
||||
q := "Oddµ"
|
||||
pages, _ := search(q, 1)
|
||||
pages, _, _ := search(q, 1)
|
||||
assert.Zero(t, len(pages))
|
||||
}
|
||||
|
||||
15
index.go
15
index.go
@@ -21,6 +21,9 @@ type Index struct {
|
||||
// documents is a map, mapping document ids of the index to
|
||||
// page names.
|
||||
documents map[trigram.DocID]string
|
||||
|
||||
// titles is a map, mapping page names to titles.
|
||||
titles map[string]string
|
||||
}
|
||||
|
||||
// index is the global Index per wiki.
|
||||
@@ -30,6 +33,7 @@ var index Index
|
||||
func (idx *Index) reset() {
|
||||
idx.index = nil
|
||||
idx.documents = nil
|
||||
idx.titles = nil
|
||||
}
|
||||
|
||||
// add reads a file and adds it to the index. This must happen while
|
||||
@@ -47,8 +51,10 @@ func (idx *Index) add(path string, info fs.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
p.handleTitle(false)
|
||||
id := idx.index.Add(strings.ToLower(string(p.Body)))
|
||||
idx.documents[id] = p.Name
|
||||
idx.titles[p.Name] = p.Title
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -59,6 +65,7 @@ func (idx *Index) load() (int, error) {
|
||||
defer idx.Unlock()
|
||||
idx.index = make(trigram.Index)
|
||||
idx.documents = make(map[trigram.DocID]string)
|
||||
idx.titles = make(map[string]string)
|
||||
err := filepath.Walk(".", idx.add)
|
||||
if err != nil {
|
||||
idx.reset()
|
||||
@@ -90,23 +97,27 @@ func (p *Page) updateIndex() {
|
||||
o, err := loadPage(p.Name)
|
||||
if err == nil {
|
||||
index.index.Delete(strings.ToLower(string(o.Body)), id)
|
||||
o.handleTitle(false)
|
||||
delete(index.titles, o.Title)
|
||||
}
|
||||
index.index.Insert(strings.ToLower(string(p.Body)), id)
|
||||
p.handleTitle(false)
|
||||
index.titles[p.Name] = p.Title
|
||||
}
|
||||
}
|
||||
|
||||
// searchDocuments searches the index for a string. It takes the index
|
||||
// read lock itself, so callers must not already hold it (sync.RWMutex read locks are not reentrant).
|
||||
func searchDocuments(q string) []string {
|
||||
words := strings.Fields(strings.ToLower(q))
|
||||
var trigrams []trigram.T
|
||||
for _, word := range words {
|
||||
trigrams = trigram.Extract(word, trigrams)
|
||||
}
|
||||
index.RLock()
|
||||
ids := index.index.QueryTrigrams(trigrams)
|
||||
names := make([]string, len(ids))
|
||||
for i, id := range ids {
|
||||
names[i] = index.documents[id]
|
||||
}
|
||||
index.RUnlock()
|
||||
return names
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
func TestIndex(t *testing.T) {
|
||||
index.load()
|
||||
q := "Oddµ"
|
||||
pages, _ := search(q, 1)
|
||||
pages, _, _ := search(q, 1)
|
||||
assert.NotZero(t, len(pages))
|
||||
for _, p := range pages {
|
||||
assert.NotContains(t, p.Title, "<b>")
|
||||
@@ -23,7 +23,7 @@ func TestIndex(t *testing.T) {
|
||||
func TestSearchHashtag(t *testing.T) {
|
||||
index.load()
|
||||
q := "#Another_Tag"
|
||||
pages, _ := search(q, 1)
|
||||
pages, _, _ := search(q, 1)
|
||||
assert.NotZero(t, len(pages))
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ func TestIndexUpdates(t *testing.T) {
|
||||
p.save()
|
||||
|
||||
// Find the phrase
|
||||
pages, _ := search("This is a test", 1)
|
||||
pages, _, _ := search("This is a test", 1)
|
||||
found := false
|
||||
for _, p := range pages {
|
||||
if p.Name == name {
|
||||
@@ -46,7 +46,7 @@ func TestIndexUpdates(t *testing.T) {
|
||||
assert.True(t, found)
|
||||
|
||||
// Find the phrase, case insensitive
|
||||
pages, _ = search("this is a test", 1)
|
||||
pages, _, _ = search("this is a test", 1)
|
||||
found = false
|
||||
for _, p := range pages {
|
||||
if p.Name == name {
|
||||
@@ -57,7 +57,7 @@ func TestIndexUpdates(t *testing.T) {
|
||||
assert.True(t, found)
|
||||
|
||||
// Find some words
|
||||
pages, _ = search("this test", 1)
|
||||
pages, _, _ = search("this test", 1)
|
||||
found = false
|
||||
for _, p := range pages {
|
||||
if p.Name == name {
|
||||
@@ -70,7 +70,7 @@ func TestIndexUpdates(t *testing.T) {
|
||||
// Update the page and no longer find it with the old phrase
|
||||
p = &Page{Name: name, Body: []byte("Guvf vf n grfg.")}
|
||||
p.save()
|
||||
pages, _ = search("This is a test", 1)
|
||||
pages, _, _ = search("This is a test", 1)
|
||||
found = false
|
||||
for _, p := range pages {
|
||||
if p.Name == name {
|
||||
@@ -81,7 +81,7 @@ func TestIndexUpdates(t *testing.T) {
|
||||
assert.False(t, found)
|
||||
|
||||
// Find page using a new word
|
||||
pages, _ = search("Guvf", 1)
|
||||
pages, _, _ = search("Guvf", 1)
|
||||
found = false
|
||||
for _, p := range pages {
|
||||
if p.Name == name {
|
||||
|
||||
89
search.go
89
search.go
@@ -5,6 +5,7 @@ import (
|
||||
"net/http"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
@@ -19,38 +20,48 @@ type Search struct {
|
||||
Previous int
|
||||
Page int
|
||||
Next int
|
||||
Last int
|
||||
More bool
|
||||
Results bool
|
||||
}
|
||||
|
||||
func sortItems(a, b *Page) int {
|
||||
// Sort by score
|
||||
if a.Score < b.Score {
|
||||
return 1
|
||||
} else if a.Score > b.Score {
|
||||
return -1
|
||||
}
|
||||
// If the score is the same and both page names start
|
||||
// with a number (like an ISO date), sort descending.
|
||||
ra, _ := utf8.DecodeRuneInString(a.Title)
|
||||
rb, _ := utf8.DecodeRuneInString(b.Title)
|
||||
if unicode.IsNumber(ra) && unicode.IsNumber(rb) {
|
||||
if a.Title < b.Title {
|
||||
return 1
|
||||
} else if a.Title > b.Title {
|
||||
// sortNames returns a sort function that sorts in three stages: 1.
|
||||
// whether the query string matches the page title; 2. descending if
|
||||
// both page names start with a number; 3. otherwise ascending by page name.
|
||||
// Access to the index requires a read lock!
|
||||
func sortNames(q string) func (a, b string) int {
|
||||
return func (a, b string) int {
|
||||
// If only one page contains the query string, it
|
||||
// takes precedence.
|
||||
ia := strings.Contains(index.titles[a], q)
|
||||
ib := strings.Contains(index.titles[b], q)
|
||||
if (ia && !ib) {
|
||||
return -1
|
||||
} else if (!ia && ib) {
|
||||
return 1
|
||||
}
|
||||
// If both page names start with a number (like an ISO date),
|
||||
// sort descending.
|
||||
ra, _ := utf8.DecodeRuneInString(a)
|
||||
rb, _ := utf8.DecodeRuneInString(b)
|
||||
if unicode.IsNumber(ra) && unicode.IsNumber(rb) {
|
||||
if a < b {
|
||||
return 1
|
||||
} else if a > b {
|
||||
return -1
|
||||
} else {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
// Otherwise sort ascending.
|
||||
if a < b {
|
||||
return -1
|
||||
} else if a > b {
|
||||
return 1
|
||||
} else {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
// Otherwise sort ascending.
|
||||
if a.Title < b.Title {
|
||||
return -1
|
||||
} else if a.Title > b.Title {
|
||||
return 1
|
||||
} else {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// load the pages named.
|
||||
@@ -67,32 +78,35 @@ func load(names []string) []*Page {
|
||||
return items
|
||||
}
|
||||
|
||||
// itemsPerPage says how many items to print on a page of search
|
||||
// results.
|
||||
const itemsPerPage = 20
|
||||
|
||||
// search returns a sorted []Page where each page contains an extract
|
||||
// of the actual Page.Body in its Page.Html. Page size is 20. The
|
||||
// boolean return value indicates whether there are more results; the integer return value is the last page number.
|
||||
func search(q string, page int) ([]*Page, bool) {
|
||||
func search(q string, page int) ([]*Page, bool, int) {
|
||||
if len(q) == 0 {
|
||||
return make([]*Page, 0), false
|
||||
return make([]*Page, 0), false, 0
|
||||
}
|
||||
index.RLock()
|
||||
names := searchDocuments(q)
|
||||
items := load(names)
|
||||
for _, p := range items {
|
||||
p.score(q)
|
||||
}
|
||||
slices.SortFunc(items, sortItems)
|
||||
from := 20*(page-1)
|
||||
slices.SortFunc(names, sortNames(q))
|
||||
index.RUnlock()
|
||||
from := itemsPerPage*(page-1)
|
||||
if from > len(names) {
|
||||
return make([]*Page, 0), false
|
||||
return make([]*Page, 0), false, 0
|
||||
}
|
||||
to := from + 20
|
||||
to := from + itemsPerPage
|
||||
if to > len(names) {
|
||||
to = len(names)
|
||||
}
|
||||
items = items[from:to]
|
||||
items := load(names[from:to])
|
||||
for _, p := range items {
|
||||
p.score(q)
|
||||
p.summarize(q)
|
||||
}
|
||||
return items, to < len(names)
|
||||
return items, to < len(names), len(names)/itemsPerPage+1
|
||||
}
|
||||
|
||||
// searchHandler presents a search result. It uses the query string in
|
||||
@@ -104,7 +118,8 @@ func searchHandler(w http.ResponseWriter, r *http.Request) {
|
||||
if err != nil {
|
||||
page = 1
|
||||
}
|
||||
items, more := search(q, page)
|
||||
s := &Search{Query: q, Items: items, Previous: page-1, Page: page, Next: page+1, Results: len(items) > 0, More: more}
|
||||
items, more, last := search(q, page)
|
||||
s := &Search{Query: q, Items: items, Previous: page-1, Page: page, Next: page+1, Last: last,
|
||||
Results: len(items) > 0, More: more}
|
||||
renderTemplate(w, "search", s)
|
||||
}
|
||||
|
||||
@@ -35,6 +35,7 @@ img { max-width: 20%; }
|
||||
{{if gt .Page 1}}<a href="/search?q={{.Query}}&page={{.Previous}}">Previous</a>{{end}}
|
||||
Page {{.Page}}
|
||||
{{if .More}}<a href="/search?q={{.Query}}&page={{.Next}}">Next</a>{{end}}
|
||||
{{if lt .Next .Last}}<a href="/search?q={{.Query}}&page={{.Last}}">Last</a>{{end}}
|
||||
{{range .Items}}
|
||||
<article lang="{{.Language}}">
|
||||
<p><a class="result" href="/view/{{.Name}}">{{.Title}}</a>
|
||||
|
||||
Reference in New Issue
Block a user