3 Commits

Author SHA1 Message Date
Alex Schroeder
2188f99dea go fmt 2023-09-26 00:07:34 +02:00
Alex Schroeder
c063174063 Allow b elements in snippets 2023-09-25 17:20:35 +02:00
Alex Schroeder
e1258da63b Index hashtags and grep the rest 2023-09-25 16:31:33 +02:00
22 changed files with 137 additions and 97 deletions

View File

@@ -2,10 +2,10 @@ package main
import (
"encoding/json"
"log"
"github.com/gomarkdown/markdown/ast"
"github.com/gomarkdown/markdown/parser"
"io"
"log"
"net/http"
"os"
"sync"

View File

@@ -18,7 +18,7 @@ Orange sky above
Reflects a distant fire
It's not `)}
p.save()
data := url.Values{}
data.Set("body", "barbecue")

View File

@@ -1,11 +1,11 @@
package main
import (
"bytes"
"github.com/gomarkdown/markdown"
"github.com/gomarkdown/markdown/ast"
"github.com/gomarkdown/markdown/parser"
"html/template"
"bytes"
"os"
"path"
"time"

View File

@@ -7,7 +7,7 @@ import (
)
func TestFeed(t *testing.T) {
assert.Contains(t,
assert.Contains(t,
assert.HTTPBody(makeHandler(viewHandler, true), "GET", "/view/index.rss", nil),
"Welcome to Oddµ")
}
@@ -16,7 +16,7 @@ func TestFeed(t *testing.T) {
func TestFeedItems(t *testing.T) {
_ = os.RemoveAll("testdata")
index.load()
p1 := &Page{Name: "testdata/cactus", Body: []byte(`# Cactus
Green head and white hair
A bench in the evening sun

1
go.mod
View File

@@ -4,7 +4,6 @@ go 1.21.0
require (
github.com/anthonynsimon/bild v0.13.0
github.com/dgryski/go-trigram v0.0.0-20160407183937-79ec494e1ad0
github.com/gomarkdown/markdown v0.0.0-20230912175223-14b07df9d538
github.com/google/subcommands v1.2.0
github.com/hexops/gotextdiff v1.0.3

2
go.sum
View File

@@ -10,8 +10,6 @@ github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3Ee
github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/go-trigram v0.0.0-20160407183937-79ec494e1ad0 h1:b+7JSiBM+hnLQjP/lXztks5hnLt1PS46hktG9VOJgzo=
github.com/dgryski/go-trigram v0.0.0-20160407183937-79ec494e1ad0/go.mod h1:qzKC/DpcxK67zaSHdCmIv3L9WJViHVinYXN2S7l3RM8=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/gomarkdown/markdown v0.0.0-20230912175223-14b07df9d538 h1:ePDpFu7l0QUV46/9A7icfL2wvIOzTJLCWh4RO2NECzE=

View File

@@ -30,7 +30,7 @@ func (cmd *htmlCmd) Execute(_ context.Context, f *flag.FlagSet, _ ...interface{}
}
func htmlCli(w io.Writer, useTemplate bool, args []string) subcommands.ExitStatus {
for _, arg := range args {
for _, arg := range args {
p, err := loadPage(arg)
if err != nil {
fmt.Fprintf(w, "Cannot load %s: %s\n", arg, err)

View File

@@ -2,9 +2,9 @@ package main
import (
"bytes"
"github.com/google/subcommands"
"github.com/stretchr/testify/assert"
"testing"
"github.com/google/subcommands"
)
func TestHtmlCmd(t *testing.T) {

View File

@@ -4,10 +4,10 @@
package main
import(
import (
"io/fs"
"path/filepath"
"log"
"path/filepath"
"sort"
"strings"
"sync"
@@ -44,9 +44,10 @@ func (idx *Index) reset() {
// addDocument adds the text as a new document. This assumes that the
// index is locked!
func (idx *Index) addDocument(text string) docid {
id := idx.next_id; idx.next_id++
for _, token := range tokens(text) {
func (idx *Index) addDocument(text []byte) docid {
id := idx.next_id
idx.next_id++
for _, token := range hashtags(text) {
ids := idx.token[token]
// Don't add same ID more than once. Checking the last
// position of the []docid works because the id is
@@ -61,8 +62,8 @@ func (idx *Index) addDocument(text string) docid {
// deleteDocument deletes the text of a document from the index. The id
// can no longer be used. This assumes that the index is locked!
func (idx *Index) deleteDocument(text string, id docid) {
for _, token := range tokens(text) {
func (idx *Index) deleteDocument(text []byte, id docid) {
for _, token := range hashtags(text) {
ids := index.token[token]
// Tokens can appear multiple times in a text but they
// can only be deleted once.
@@ -106,7 +107,7 @@ func (idx *Index) add(path string, info fs.FileInfo, err error) error {
}
p.handleTitle(false)
id := idx.addDocument(string(p.Body))
id := idx.addDocument(p.Body)
idx.documents[id] = p.Name
idx.titles[p.Name] = p.Title
return nil
@@ -136,7 +137,7 @@ func (idx *Index) dump() {
for token, ids := range idx.token {
log.Printf("%s: %v", token, ids)
}
}
}
// updateIndex updates the index for a single page. The old text is
// loaded from the disk and removed from the index first, if it
@@ -153,16 +154,16 @@ func (p *Page) updateIndex() {
}
}
if id == 0 {
id = index.addDocument(string(p.Body))
id = index.addDocument(p.Body)
index.documents[id] = p.Name
index.titles[p.Name] = p.Title
} else {
if o, err := loadPage(p.Name); err == nil {
index.deleteDocument(string(o.Body), id)
index.deleteDocument(o.Body, id)
}
// Do not reuse the old id. We need a new one for
// indexing to work.
id = index.addDocument(string(p.Body))
id = index.addDocument(p.Body)
index.documents[id] = p.Name
p.handleTitle(false)
// The page name stays the same but the title may have
@@ -171,7 +172,6 @@ func (p *Page) updateIndex() {
}
}
// removeFromIndex removes the page from the index. Do this when
// deleting a page.
func (p *Page) removeFromIndex() {
@@ -194,32 +194,59 @@ func (p *Page) removeFromIndex() {
log.Printf("Page %s cannot removed from the index: %s", p.Name, err)
return
}
index.deleteDocument(string(o.Body), id)
index.deleteDocument(o.Body, id)
}
// searchDocuments searches the index for a query string and returns
// page names.
// search searches the index for a query string and returns page
// names.
func (idx *Index) search(q string) []string {
index.RLock()
defer index.RUnlock()
var r []docid
for _, token := range tokens(q) {
if ids, ok := idx.token[token]; ok {
if r == nil {
r = ids
names := make([]string, 0)
hashtags := hashtags([]byte(q))
if len(hashtags) > 0 {
var r []docid
for _, token := range hashtags {
if ids, ok := idx.token[token]; ok {
if r == nil {
r = ids
} else {
r = intersection(r, ids)
}
} else {
r = intersection(r, ids)
// Token doesn't exist therefore abort search.
return nil
}
} else {
// Token doesn't exist therefore abort search.
return nil
}
for _, id := range r {
names = append(names, idx.documents[id])
}
} else {
for _, name := range idx.documents {
names = append(names, name)
}
}
names := make([]string, 0)
for _, id := range r {
names = append(names, idx.documents[id])
return grep(tokens(q), names)
}
func grep(tokens, names []string) []string {
results := make([]string, 0)
NameLoop:
for _, name := range names {
p, err := loadPage(name)
if err != nil {
log.Printf("Cannot load %s: %s", name, err)
continue
}
body := strings.ToLower(string(p.Body))
for _, token := range tokens {
if !strings.Contains(body, token) {
continue NameLoop
}
}
results = append(results, name)
}
return names
return results
}
// intersection returns the set intersection between a and b.

10
page.go
View File

@@ -2,9 +2,9 @@ package main
import (
"bytes"
"log"
"github.com/microcosm-cc/bluemonday"
"html/template"
"log"
"net/url"
"os"
"path/filepath"
@@ -27,10 +27,12 @@ type Page struct {
Hashtags []string
}
// santize uses bluemonday to sanitize the HTML.
// No exceptions are made because this is used for snippets.
// sanitizeStrict uses bluemonday to sanitize the HTML. An exception is
// made for the b tag because this is used for snippets.
func sanitizeStrict(s string) template.HTML {
return template.HTML(bluemonday.StrictPolicy().Sanitize(s))
policy := bluemonday.StrictPolicy()
policy.AllowElements("b")
return template.HTML(policy.Sanitize(s))
}
// santizeBytes uses bluemonday to sanitize the HTML.

View File

@@ -35,7 +35,7 @@ func wikiLink(p *parser.Parser, fn func(p *parser.Parser, data []byte, offset in
// hashtag returns an inline parser function. This indirection is
// required because we want to receive an array of hashtags found.
func hashtag() (func(p *parser.Parser, data []byte, offset int) (int, ast.Node), *[]string) {
hashtags := make([]string,0)
hashtags := make([]string, 0)
return func(p *parser.Parser, data []byte, offset int) (int, ast.Node) {
data = data[offset:]
i := 0
@@ -49,8 +49,8 @@ func hashtag() (func(p *parser.Parser, data []byte, offset int) (int, ast.Node),
hashtags = append(hashtags, string(data[1:i]))
link := &ast.Link{
AdditionalAttributes: []string{`class="tag"`},
Destination: append([]byte("/search?q=%23"), data[1:i]...),
Title: data[0:i],
Destination: append([]byte("/search?q=%23"), data[1:i]...),
Title: data[0:i],
}
text := bytes.ReplaceAll(data[0:i], []byte("_"), []byte(" "))
ast.AppendChild(link, &ast.Text{Leaf: ast.Leaf{Literal: text}})

View File

@@ -2,8 +2,8 @@ package main
import (
"bytes"
"github.com/stretchr/testify/assert"
"github.com/google/subcommands"
"github.com/stretchr/testify/assert"
"os"
"testing"
)
@@ -32,7 +32,7 @@ You are no planet`)}
1 change was made.
This is a dry run. Use -confirm to make it happen.
`
b := new(bytes.Buffer)
s := replaceCli(b, false, []string{`\bno planet`, `planetoid`})
assert.Equal(t, subcommands.ExitSuccess, s)

View File

@@ -1,21 +1,21 @@
package main
import (
"bytes"
"context"
"flag"
"fmt"
"github.com/google/subcommands"
"io"
"os"
"strings"
"bytes"
"slices"
"path/filepath"
"io/fs"
"os"
"path/filepath"
"slices"
"strings"
)
type searchCmd struct {
page int
page int
exact bool
}
@@ -42,8 +42,8 @@ func (cmd *searchCmd) Execute(_ context.Context, f *flag.FlagSet, _ ...interface
// searchCli runs the search command on the command line. It is used
// here with an io.Writer for easy testing.
func searchCli(w io.Writer, n int, exact bool, args []string) subcommands.ExitStatus {
var fn func(q string, n int) ([]*Page, bool, int);
if (exact) {
var fn func(q string, n int) ([]*Page, bool, int)
if exact {
fn = searchExact
} else {
index.load()
@@ -75,7 +75,7 @@ func searchExact(q string, page int) ([]*Page, bool, int) {
pages := make(map[string]*Page)
names := make([]string, 0)
index.titles = make(map[string]string)
err := filepath.Walk(".", func (path string, info fs.FileInfo, err error) error {
err := filepath.Walk(".", func(path string, info fs.FileInfo, err error) error {
if err != nil {
return err
}
@@ -112,7 +112,7 @@ func searchExact(q string, page int) ([]*Page, bool, int) {
to = len(names)
}
items := make([]*Page, 0)
for i := from; i<to; i++ {
for i := from; i < to; i++ {
p := pages[names[i]]
p.score(q)
p.summarize(q)

View File

@@ -2,9 +2,9 @@ package main
import (
"bytes"
"github.com/google/subcommands"
"github.com/stretchr/testify/assert"
"testing"
"github.com/google/subcommands"
)
func TestSearchCmd(t *testing.T) {

View File

@@ -3,8 +3,8 @@ package main
import (
"github.com/stretchr/testify/assert"
"net/url"
"testing"
"os"
"testing"
)
func TestSearch(t *testing.T) {
@@ -26,7 +26,7 @@ The silence streches.`)}
data := url.Values{}
data.Set("q", "look")
body := assert.HTTPBody(searchHandler, "GET", "/search", data)
assert.Contains(t, body, "We look")
assert.Contains(t, body, "We <b>look</b>")
assert.NotContains(t, body, "Odd?")
assert.Contains(t, body, "Even?")
}

View File

@@ -1,20 +1,21 @@
package main
import (
"log"
"regexp"
"strings"
)
// re returns a regular expression matching any word in q.
func re(q string) (*regexp.Regexp, error) {
q = regexp.QuoteMeta(q)
re, err := regexp.Compile(`\s+`)
if err != nil {
return nil, err
fields := strings.Fields(q)
quoted := make([]string, len(fields))
for i, w := range fields {
quoted[i] = regexp.QuoteMeta(w)
}
words := re.ReplaceAllString(q, "|")
re, err = regexp.Compile(`(?i)(` + words + `)`)
re, err := regexp.Compile(`(?i)(` + strings.Join(quoted, "|") + `)`)
if err != nil {
log.Printf("Cannot compile %s %v: %s", q, quoted, err)
return nil, err
}
return re, nil
@@ -26,12 +27,16 @@ func snippets(q string, s string) string {
maxsnippets := 4
re, err := re(q)
// If the compilation didn't work, truncate and return
if err != nil || len(s) <= snippetlen {
if err != nil {
if len(s) > 400 {
s = s[0:400] + " …"
}
return s
}
// Short cut for short pages
if len(s) <= snippetlen {
return highlight(q, re, s)
}
// show a snippet from the beginning of the document
j := strings.LastIndex(s[:snippetlen], " ")
if j == -1 {

View File

@@ -1,32 +1,19 @@
package main
import (
"bytes"
"strings"
"unicode"
"unicode/utf8"
)
// tokenize returns a slice of tokens for the given text.
// tokenize returns a slice of alphanumeric tokens for the given text.
func tokenize(text string) []string {
return strings.FieldsFunc(text, func(r rune) bool {
// Split on any character that is not a letter or a
// number, not the hash sign (for hash tags)
return !unicode.IsLetter(r) && !unicode.IsNumber(r) && r != '#'
return !unicode.IsLetter(r) && !unicode.IsNumber(r)
})
}
// shortWordFilter drops short words from tokens and returns the
// survivors in their original order. A token is kept when it is longer
// than three bytes, or when it is exactly three bytes long and already
// all upper case (acronyms such as USA, EUR, CHF). Length is measured
// with len, i.e. in bytes, not in runes. The input slice is not
// modified; the result is always a fresh, non-nil slice.
func shortWordFilter(tokens []string) []string {
	kept := make([]string, 0, len(tokens))
	for _, word := range tokens {
		switch n := len(word); {
		case n > 3:
			// Long enough to be worth keeping.
			kept = append(kept, word)
		case n == 3 && strings.ToUpper(word) == word:
			// Three-letter acronym written in capitals.
			kept = append(kept, word)
		}
	}
	return kept
}
// lowercaseFilter returns a slice of lower case tokens.
func lowercaseFilter(tokens []string) []string {
r := make([]string, len(tokens))
@@ -36,10 +23,34 @@ func lowercaseFilter(tokens []string) []string {
return r
}
// tokens returns a slice of tokens.
// tokens returns a slice of alphanumeric tokens.
func tokens(text string) []string {
tokens := tokenize(text)
tokens = shortWordFilter(tokens)
tokens = lowercaseFilter(tokens)
return tokens
}
// hashtags returns the hashtags found in s, lower-cased and in order
// of appearance. A hashtag is a '#' immediately followed by one or
// more letters, numbers or underscores; the leading '#' is part of the
// returned string. A '#' with nothing matching after it (e.g. a
// Markdown heading marker followed by a space) is skipped. Duplicates
// are not removed. The result is never nil.
func hashtags(s []byte) []string {
	found := []string{}
	for start := bytes.IndexRune(s, '#'); start != -1; start = bytes.IndexRune(s, '#') {
		// Extend end over every rune that may be part of the tag.
		end := start + 1
		for end < len(s) {
			r, size := utf8.DecodeRune(s[end:])
			if !unicode.IsLetter(r) && !unicode.IsNumber(r) && r != '_' {
				break
			}
			end += size
		}
		if end-start > 1 { // more than just "#"
			found = append(found, string(bytes.ToLower(s[start:end])))
		}
		// Continue scanning after what was just consumed.
		s = s[end:]
	}
	return found
}

View File

@@ -1,15 +1,13 @@
package main
import (
"testing"
"github.com/stretchr/testify/assert"
"testing"
)
func TestTokenizer(t *testing.T) {
assert.EqualValues(t, []string{}, tokens(""), "empty string")
assert.EqualValues(t, []string{}, tokens("the a"), "no short words")
assert.EqualValues(t, []string{"chf"}, tokens("CHF"), "three letter acronyms")
assert.EqualValues(t, []string{}, tokens("CH"), "no two letter acronyms")
assert.EqualValues(t, []string{"franc"}, tokens("Franc"), "lower case")
assert.EqualValues(t, []string{"know", "what"}, tokens("I don't know what to do."))
assert.EqualValues(t, []string{"i", "don", "t", "know", "what", "to", "do"}, tokens("I don't know what to do."))
assert.EqualValues(t, []string{"#truth"}, hashtags([]byte("This is boring. #Truth")), "hashtags")
}

View File

@@ -3,8 +3,8 @@ package main
import (
"net/http"
"os"
"time"
"strings"
"time"
)
// rootHandler just redirects to /view/index.
@@ -31,7 +31,7 @@ func viewHandler(w http.ResponseWriter, r *http.Request, name string) {
file = false
if strings.HasSuffix(fn, ".rss") {
rss = true
name = fn[0:len(fn)-4]
name = fn[0 : len(fn)-4]
fn = name
}
fn += ".md"

View File

@@ -88,14 +88,13 @@ I like spring better
assert.NoError(t, err)
h := makeHandler(viewHandler, true)
assert.Equal(t, []string{fi.ModTime().UTC().Format(http.TimeFormat)},
HTTPHeaders(h, "GET", "/view/testdata/now", nil, "Last-Modified"))
HTTPHeaders(h, "GET", "/view/testdata/now", nil, "Last-Modified"))
HTTPStatusCodeIfModifiedSince(t, h, "/view/testdata/now", fi.ModTime())
t.Cleanup(func() {
_ = os.RemoveAll("testdata")
})
}
// wipes testdata
func TestPageHead(t *testing.T) {
_ = os.RemoveAll("testdata")

View File

@@ -3,9 +3,9 @@ package main
import (
"context"
"flag"
"log"
"github.com/google/subcommands"
"html/template"
"log"
"net/http"
"os"
"regexp"

View File

@@ -66,6 +66,7 @@ func HTTPUploadAndRedirectTo(t *testing.T, handler http.HandlerFunc, url, conten
"Expected HTTP redirect location %s for %q but received %v", destination, url, headers)
return isRedirectCode
}
// HTTPStatusCodeIfModifiedSince checks that the request results in a
// 304 response for the given time.
func HTTPStatusCodeIfModifiedSince(t *testing.T, handler http.HandlerFunc, url string, ti time.Time) {