regex after all heh
This commit is contained in:
@ -26,8 +26,8 @@ func TestLinkNoTag(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestLinkTag(t *testing.T) {
|
func TestLinkTag(t *testing.T) {
|
||||||
var message string = "** https://pleroma.social/announcements/feed.xml :software:"
|
var message string = "** https://pleroma.social/announcements/feed.xml :software: :social:"
|
||||||
var expected string = "https://pleroma.social/announcements/feed.xml software"
|
var expected string = "https://pleroma.social/announcements/feed.xml software social"
|
||||||
result := LexerTestWrapper(message, expected)
|
result := LexerTestWrapper(message, expected)
|
||||||
if result != expected {
|
if result != expected {
|
||||||
LexerTestWrapperFail(expected, result)
|
LexerTestWrapperFail(expected, result)
|
||||||
@ -36,8 +36,8 @@ func TestLinkTag(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestLinkDescTag(t *testing.T) {
|
func TestLinkDescTag(t *testing.T) {
|
||||||
var message string = "** [[https://pleroma.social/announcements/feed.xml][Pleroma Social]] :software:"
|
var message string = "** [[https://pleroma.social/announcements/feed.xml][Pleroma Social]] :software:"
|
||||||
var expected string = "https://pleroma.social/announcements/feed.xml # Pleroma Social"
|
var expected string = "https://pleroma.social/announcements/feed.xml software # Pleroma Social"
|
||||||
var result string = LexerTestWrapper(message, expected)
|
var result string = LexerTestWrapper(message, expected)
|
||||||
if result != expected {
|
if result != expected {
|
||||||
LexerTestWrapperFail(expected, result)
|
LexerTestWrapperFail(expected, result)
|
||||||
|
|||||||
115
tokenizer.go
115
tokenizer.go
@ -2,6 +2,7 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -14,18 +15,12 @@ type Feed struct {
|
|||||||
// Return the final feed string, depending on either the link has a description, tags or not
|
// Return the final feed string, depending on either the link has a description, tags or not
|
||||||
func (f Feed) String() string {
|
func (f Feed) String() string {
|
||||||
var ff string
|
var ff string
|
||||||
var tags string
|
var tags string = strings.TrimSpace(strings.Join(f.Tags, " "))
|
||||||
|
|
||||||
if len(f.Tags) > 0 {
|
ff = fmt.Sprintf("%s %s", f.URL, tags)
|
||||||
for i := range f.Tags {
|
|
||||||
tags += " " + f.Tags[i]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if f.Description == "" {
|
if f.Description != "" {
|
||||||
ff = fmt.Sprintf("%s %s", f.URL, tags)
|
ff = fmt.Sprintf("%s # %s", strings.TrimSpace(ff), f.Description)
|
||||||
} else {
|
|
||||||
ff = fmt.Sprintf("%s %s # %s", f.URL, f.Description, tags)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return strings.TrimSpace(ff)
|
return strings.TrimSpace(ff)
|
||||||
@ -71,91 +66,49 @@ func (l Lexer) CountToken(t string) int {
|
|||||||
return counter
|
return counter
|
||||||
}
|
}
|
||||||
|
|
||||||
func (l Lexer) IdentifyContent() string {
|
func (l *Lexer) ProcessSimpleLink(s string) {
|
||||||
var tokenType string
|
ss := strings.Split(s[3:], " ")
|
||||||
if l.CountToken("[") == 2 {
|
for w := range ss {
|
||||||
tokenType = "URL"
|
if w == 0 {
|
||||||
|
url := strings.TrimSpace(ss[w])
|
||||||
|
l.AddToken("URL", url)
|
||||||
|
} else {
|
||||||
|
if ss[w] != "" && ss[w] != " " {
|
||||||
|
tag := strings.ReplaceAll(ss[w], ":", "")
|
||||||
|
l.AddToken("TAG", tag)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if l.CountToken("[") == 3 {
|
|
||||||
tokenType = "DESC"
|
|
||||||
} else {
|
|
||||||
|
|
||||||
}
|
|
||||||
return tokenType
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tokenize a given line s from the org file
|
// Tokenize a given line s from the org file
|
||||||
func (l Lexer) Process(s string) []Token {
|
func (l Lexer) Process(s string) []Token {
|
||||||
if strings.Count(s, "[") <= 0 {
|
if s[3] != '[' {
|
||||||
ss := strings.Split(s, " ")
|
l.ProcessSimpleLink(s)
|
||||||
fmt.Println(ss)
|
|
||||||
for w := range ss {
|
|
||||||
if w == 0 {
|
|
||||||
l.AddToken("URL", ss[w])
|
|
||||||
} else {
|
|
||||||
l.AddToken("TAG", ss[w])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return l.Tokens
|
return l.Tokens
|
||||||
}
|
}
|
||||||
|
|
||||||
for i := range s {
|
re := regexp.MustCompile(`(?:\[\[)(?P<url>\S+)(?:\]\[)(?P<desc>.+)(?:\]\])(?P<tags>.+)?`)
|
||||||
char := string(s[i])
|
matches := re.FindStringSubmatch(s)
|
||||||
switch char {
|
if len(matches) > 1 {
|
||||||
case "*":
|
l.AddToken("URL", strings.TrimSpace(matches[1]))
|
||||||
l.AddToken("HEADER", char)
|
|
||||||
|
|
||||||
case "[":
|
|
||||||
l.AddToken("OBRACKET", char)
|
|
||||||
|
|
||||||
case "]":
|
|
||||||
// non-empty buffer and closing bracket means
|
|
||||||
// current state is out of "content" context and buffer can be tokenized
|
|
||||||
if len(l.Buf) > 0 {
|
|
||||||
var tokenType string = l.IdentifyContent()
|
|
||||||
l.AddToken("CBRACKET", char)
|
|
||||||
l.AddToken(tokenType, l.Buf)
|
|
||||||
l.Buf = ""
|
|
||||||
}
|
|
||||||
|
|
||||||
// whitespaces have different meaning given the context :
|
|
||||||
// Either separator or part of a content string
|
|
||||||
case " ":
|
|
||||||
lt := l.LastToken()
|
|
||||||
if len(l.Buf) > 0 {
|
|
||||||
l.AddToken(l.IdentifyContent(), l.Buf)
|
|
||||||
l.Buf = ""
|
|
||||||
l.AddToken("WHITESPACE", char)
|
|
||||||
}
|
|
||||||
|
|
||||||
if i > 0 {
|
|
||||||
if string(s[i-1]) == " " {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if lt.Type != "WHITESPACE" {
|
|
||||||
if len(l.Buf) == 0 {
|
|
||||||
l.AddToken("WHITESPACE", char)
|
|
||||||
} else {
|
|
||||||
l.Buf += char
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
default:
|
|
||||||
l.Buf += char
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
if len(matches) > 2 {
|
||||||
if len(l.Buf) > 0 {
|
l.AddToken("DESC", strings.TrimSpace(matches[2]))
|
||||||
l.AddToken(l.IdentifyContent(), l.Buf)
|
}
|
||||||
|
if len(matches) > 3 {
|
||||||
|
tags := strings.Split(matches[3], " ")
|
||||||
|
for t := range tags {
|
||||||
|
if tags[t] != "" && tags[t] != " " {
|
||||||
|
tag := strings.ReplaceAll(tags[t], ":", "")
|
||||||
|
l.AddToken("TAG", strings.TrimSpace(tag))
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return l.Tokens
|
return l.Tokens
|
||||||
}
|
}
|
||||||
|
|
||||||
// Only retrieve content tokens, ignores uneeded separators and brackets
|
|
||||||
func Parse(t []Token) Feed {
|
func Parse(t []Token) Feed {
|
||||||
var f Feed
|
var f Feed
|
||||||
for i := range t {
|
for i := range t {
|
||||||
|
|||||||
Reference in New Issue
Block a user