Files
oddmu/tokenizer.go
Alex Schroeder 2188f99dea go fmt
2023-09-26 00:07:34 +02:00

57 lines
1.1 KiB
Go

package main
import (
"bytes"
"strings"
"unicode"
"unicode/utf8"
)
// tokenize returns a slice of alphanumeric tokens for the given text.
func tokenize(text string) []string {
return strings.FieldsFunc(text, func(r rune) bool {
return !unicode.IsLetter(r) && !unicode.IsNumber(r)
})
}
// lowercaseFilter returns a slice of lower case tokens.
func lowercaseFilter(tokens []string) []string {
r := make([]string, len(tokens))
for i, token := range tokens {
r[i] = strings.ToLower(token)
}
return r
}
// tokens returns a slice of alphanumeric tokens.
func tokens(text string) []string {
tokens := tokenize(text)
tokens = lowercaseFilter(tokens)
return tokens
}
// hashtags returns a slice of hashtags.
func hashtags(s []byte) []string {
hashtags := make([]string, 0)
for {
i := bytes.IndexRune(s, '#')
if i == -1 {
return hashtags
}
from := i
i++
for {
r, n := utf8.DecodeRune(s[i:])
if n > 0 && (unicode.IsLetter(r) || unicode.IsNumber(r) || r == '_') {
i += n
} else {
break
}
}
if i > from+1 { // not just "#"
hashtags = append(hashtags, string(bytes.ToLower(s[from:i])))
}
s = s[i:]
}
}