1
0

borked the shitty lexer, tests, new entities

This commit is contained in:
2021-03-04 11:42:31 +01:00
parent e6c5fca61e
commit 70adeedf39
3 changed files with 235 additions and 121 deletions

122
main.go
View File

@ -4,128 +4,8 @@ import (
"bufio"
"fmt"
"os"
"strings"
)
// Token is a single lexed item: a type tag plus the raw text it covers.
type Token struct {
	Type  string
	Value string
}

// String renders the token as `TYPE : 'value'` for debug output.
func (t Token) String() string {
	out := fmt.Sprintf("%s : '%s'", t.Type, t.Value)
	return out
}

// Lexer embeds a temporary buffer to store "content"
// (url, description, tags) and an array of tokens.
type Lexer struct {
	Buf    string
	Tokens []Token
}

// AddToken appends a token of type t carrying value s.
func (l *Lexer) AddToken(t string, s string) {
	l.Tokens = append(l.Tokens, Token{Type: t, Value: s})
}

// LastToken returns the most recently emitted token, or the zero Token
// when nothing has been tokenized yet; useful to determine context for
// a symbol.
func (l Lexer) LastToken() Token {
	if n := len(l.Tokens); n > 0 {
		return l.Tokens[n-1]
	}
	return Token{}
}
// Process tokenizes a single line s from the org file.
//
// It walks the line byte by byte: "*" emits a HEADER token, "[" an
// OBRACKET, and any other character accumulates in l.Buf until a "]"
// or a separating space flushes the buffer as a CONTENT token.
// Leftover buffer content at end of line is flushed as a final CONTENT.
func (l Lexer) Process(s string) []Token {
	for i := range s {
		char := string(s[i])
		switch char {
		case "*":
			l.AddToken("HEADER", char)
		case "[":
			// NOTE(review): unlike "]", an opening bracket does not flush
			// any buffered content first — confirm this is intended.
			l.AddToken("OBRACKET", char)
		case "]":
			// non-empty buffer and closing bracket means current state is out of "content" context and buffer can be tokenized
			// NOTE(review): when the buffer is empty (e.g. the second "]"
			// of "]]"), no CBRACKET token is emitted at all.
			if len(l.Buf) > 0 {
				l.AddToken("CONTENT", l.Buf)
				l.Buf = ""
				l.AddToken("CBRACKET", char)
			}
		// whitespaces have different meaning given the context : Either separator or part of a content string
		case " ":
			lt := l.LastToken()
			// outside bracket context, a space terminates buffered content
			if len(l.Buf) > 0 && lt.Type != "OBRACKET" {
				l.AddToken("CONTENT", l.Buf)
				l.Buf = ""
				l.AddToken("WHITESPACE", char)
			}
			// collapse runs of consecutive spaces
			if i > 0 {
				if string(s[i-1]) == " " {
					break
				}
			}
			// NOTE(review): lt was captured before the flush above, so when
			// a flush happened this can append a second WHITESPACE in a row.
			if lt.Type != "WHITESPACE" {
				if len(l.Buf) == 0 {
					l.AddToken("WHITESPACE", char)
				} else {
					// inside brackets the space is part of the content
					l.Buf += char
				}
			}
		default:
			l.Buf += char
		}
	}
	// flush trailing content (e.g. tags at the end of the line)
	if len(l.Buf) > 0 {
		l.AddToken("CONTENT", l.Buf)
	}
	return l.Tokens
}
// Parse keeps only the CONTENT tokens from t, ignoring separator and
// bracket tokens, and returns their values in order of appearance.
func Parse(t []Token) []string {
	var content []string
	for _, tok := range t {
		if tok.Type == "CONTENT" {
			content = append(content, tok.Value)
		}
	}
	return content
}
// FormatFeed returns the final feed string, depending on whether the link
// has a description, tags or not. The content slice is expected in the
// order produced by Parse: [0] URL, [1] description, [2] ":tag:" string
// (colons are stripped from the tag).
//
// Fix: the original indexed content[0]/content[1] unconditionally, which
// panicked on empty or single-element input; the len > 1 assignment was
// also dead code, always overwritten by the following if/else.
func FormatFeed(content []string) string {
	if len(content) == 0 {
		return ""
	}
	url := content[0]
	switch {
	case len(content) > 2:
		// URL + description + tag: "url tag # description"
		tag := strings.ReplaceAll(content[2], ":", "")
		comment := content[1]
		return fmt.Sprintf("%s %s # %s\n", url, tag, comment)
	case len(content) == 2:
		// URL + tag only
		tag := strings.ReplaceAll(content[1], ":", "")
		return fmt.Sprintf("%s %s\n", url, tag)
	default:
		// bare URL
		return fmt.Sprintf("%s\n", url)
	}
}
func IsExistFile(path string) error {
if _, err := os.Stat(path); os.IsNotExist(err) {
message := fmt.Sprintf("File does not exist : %s", err)
@ -170,7 +50,7 @@ func main() {
}
tokens := lexer.Process(scanner.Text())
feed := FormatFeed(Parse(tokens))
feed := Parse(tokens).String()
file.WriteString(feed)
}

57
main_test.go Normal file
View File

@ -0,0 +1,57 @@
package main
import (
"fmt"
"testing"
)
// LexerTestWrapper runs message through a fresh Lexer and returns the
// rendered feed string. NOTE(review): the expected parameter is unused
// here; it is kept so existing call sites keep compiling.
func LexerTestWrapper(message string, expected string) string {
	var l Lexer
	return Parse(l.Process(message)).String()
}
// LexerTestWrapperFail prints the expected and actual feed strings when
// a lexer test fails, so the mismatch is visible in the test output.
func LexerTestWrapperFail(expected string, result string) {
	fmt.Println("Expected :", expected)
	fmt.Println("Got : ", result)
}
// TestLinkNoTag: a bare header line should yield just the URL.
func TestLinkNoTag(t *testing.T) {
	message := "** https://pleroma.social/announcements/feed.xml"
	expected := "https://pleroma.social/announcements/feed.xml"
	if result := LexerTestWrapper(message, expected); result != expected {
		LexerTestWrapperFail(expected, result)
		t.Fail()
	}
}
// TestLinkTag: a header line with an org tag should yield "url tag"
// with the surrounding colons stripped from the tag.
func TestLinkTag(t *testing.T) {
	message := "** https://pleroma.social/announcements/feed.xml :software:"
	expected := "https://pleroma.social/announcements/feed.xml software"
	if result := LexerTestWrapper(message, expected); result != expected {
		LexerTestWrapperFail(expected, result)
		t.Fail()
	}
}
// TestLinkDescTag: a bracketed link with a description and a tag.
// NOTE(review): the input carries a :software: tag that the expected
// string omits — confirm whether tags on description lines are meant to
// be dropped, or whether this expectation is unfinished.
func TestLinkDescTag(t *testing.T) {
	message := "** [[https://pleroma.social/announcements/feed.xml][Pleroma Social]] :software:"
	expected := "https://pleroma.social/announcements/feed.xml # Pleroma Social"
	if result := LexerTestWrapper(message, expected); result != expected {
		LexerTestWrapperFail(expected, result)
		t.Fail()
	}
}
// TestLinkDescNoTag: a bracketed link with a description but no tag
// should render as "url # description".
func TestLinkDescNoTag(t *testing.T) {
	message := "** [[https://pleroma.social/announcements/feed.xml][Pleroma Social]]"
	expected := "https://pleroma.social/announcements/feed.xml # Pleroma Social"
	if result := LexerTestWrapper(message, expected); result != expected {
		LexerTestWrapperFail(expected, result)
		t.Fail()
	}
}

177
tokenizer.go Normal file
View File

@ -0,0 +1,177 @@
package main
import (
"fmt"
"strings"
)
// Feed is a single parsed org entry: a feed URL with an optional
// description and an optional list of tags.
type Feed struct {
	URL         string
	Description string
	Tags        []string
}

// String returns the final feed string, depending on whether the link
// has a description, tags or not. The layout is "URL tags # description",
// matching the old FormatFeed helper and the expectations in main_test.go.
//
// Fixes: the original built the tag string with a leading space and then
// added another via Sprintf (yielding a double space, so "url  tag" never
// matched the tests), and it placed the description before the "#" slot
// and the tags after it — swapped relative to the rest of the code base.
func (f Feed) String() string {
	parts := []string{f.URL}
	if len(f.Tags) > 0 {
		parts = append(parts, strings.Join(f.Tags, " "))
	}
	if f.Description != "" {
		parts = append(parts, "#", f.Description)
	}
	return strings.TrimSpace(strings.Join(parts, " "))
}
// Token is a single lexed item: a type tag plus the raw text it covers.
type Token struct {
	Type  string
	Value string
}

// String renders the token as `TYPE : 'value'` for debug output.
func (t Token) String() string {
	return fmt.Sprintf("%s : '%s'", t.Type, t.Value)
}

// Lexer embeds a temporary buffer to store "content"
// (url, description, tags) and an array of tokens.
type Lexer struct {
	Buf    string
	Tokens []Token
}

// AddToken appends a token of type t carrying value s.
func (l *Lexer) AddToken(t string, s string) {
	l.Tokens = append(l.Tokens, Token{t, s})
}

// LastToken returns the last tokenized item (zero Token when nothing has
// been emitted yet), useful to determine context for a symbol.
func (l Lexer) LastToken() Token {
	var lastToken Token
	if len(l.Tokens) > 0 {
		lastToken = l.Tokens[len(l.Tokens)-1]
	}
	return lastToken
}

// CountToken returns how many emitted tokens carry the literal value t,
// e.g. CountToken("[") counts the opening brackets seen so far.
func (l Lexer) CountToken(t string) int {
	var counter int
	for i := range l.Tokens {
		if l.Tokens[i].Value == t {
			counter++ // fix: was "counter ++" (not gofmt-clean)
		}
	}
	return counter
}

// IdentifyContent infers the type of the buffered content from the
// brackets emitted so far on the line:
//   - inside the first "[[" pair          -> URL
//   - inside the second "][" bracket pair -> DESC
//   - after both pairs are closed         -> TAG (fix: the original had
//     an empty else branch and never returned TAG, so trailing tags were
//     emitted with an empty type or misfiled as DESC)
func (l Lexer) IdentifyContent() string {
	open := l.CountToken("[")
	closed := l.CountToken("]")
	switch {
	case open == 2:
		return "URL"
	case open >= 3 && closed < 2:
		return "DESC"
	case open >= 3:
		return "TAG"
	default:
		return ""
	}
}

// Process tokenizes a single org line s.
//
// Plain lines ("** url :tag:") are split on whitespace: words made only
// of stars become HEADER tokens, the first real word is the URL and any
// following words are TAGs (surrounding colons stripped). Bracketed
// lines ("** [[url][description]] :tag:") are scanned character by
// character, buffering content until a bracket or separator flushes it.
//
// Fixes vs. the original: the debug fmt.Println(ss) is removed; the
// header stars are no longer tokenized as the URL; tag colons are
// stripped; the "inside brackets" guard on spaces is restored so
// multi-word descriptions stay in one DESC token; the duplicate
// WHITESPACE emission after a flush is gone.
func (l Lexer) Process(s string) []Token {
	if !strings.Contains(s, "[") {
		seenURL := false
		for _, w := range strings.Fields(s) {
			switch {
			case strings.Trim(w, "*") == "":
				// org header stars, e.g. "**"
				l.AddToken("HEADER", w)
			case !seenURL:
				l.AddToken("URL", w)
				seenURL = true
			default:
				l.AddToken("TAG", strings.Trim(w, ":"))
			}
		}
		return l.Tokens
	}
	for i := range s {
		char := string(s[i])
		switch char {
		case "*":
			l.AddToken("HEADER", char)
		case "[":
			l.AddToken("OBRACKET", char)
		case "]":
			// non-empty buffer and closing bracket means the current state
			// is out of "content" context and the buffer can be tokenized
			if len(l.Buf) > 0 {
				tokenType := l.IdentifyContent()
				l.AddToken("CBRACKET", char)
				l.AddToken(tokenType, l.Buf)
				l.Buf = ""
			}
		case " ":
			// whitespace is a separator outside brackets but part of the
			// content (multi-word descriptions) inside them
			lt := l.LastToken()
			if len(l.Buf) > 0 && lt.Type != "OBRACKET" {
				tokenType := l.IdentifyContent()
				value := l.Buf
				if tokenType == "TAG" {
					value = strings.Trim(value, ":")
				}
				l.AddToken(tokenType, value)
				l.Buf = ""
				l.AddToken("WHITESPACE", char)
			}
			// collapse runs of consecutive spaces
			if i > 0 && string(s[i-1]) == " " {
				break
			}
			// re-read the last token here: a flush above already emitted
			// a WHITESPACE, and lt would be stale
			if l.LastToken().Type != "WHITESPACE" {
				if len(l.Buf) == 0 {
					l.AddToken("WHITESPACE", char)
				} else {
					l.Buf += char
				}
			}
		default:
			l.Buf += char
		}
	}
	// flush trailing content (typically the ":tag:" suffix)
	if len(l.Buf) > 0 {
		tokenType := l.IdentifyContent()
		value := l.Buf
		if tokenType == "TAG" {
			value = strings.Trim(value, ":")
		}
		l.AddToken(tokenType, value)
	}
	return l.Tokens
}
// Parse folds a token stream into a Feed, keeping only the content
// tokens (URL, DESC, TAG) and ignoring separators and brackets.
func Parse(t []Token) Feed {
	var f Feed
	for _, tok := range t {
		switch tok.Type {
		case "URL":
			f.URL = tok.Value
		case "DESC":
			f.Description = tok.Value
		case "TAG":
			f.Tags = append(f.Tags, tok.Value)
		}
	}
	return f
}