// org2newsboat/tokenizer.go

package main

import (
	"fmt"
	"strings"
)

// Feed is one feed entry: its URL, an optional description and a list of tags.
type Feed struct {
	URL         string
	Description string
	Tags        []string
}

// String renders the feed as a single line.
//
// Without a description the line is "<URL> <tags>"; with one it is
// "<URL> <Description> # <tags>". Every tag is written with a leading space,
// so the tag list ends up separated from what precedes it by two spaces.
// Leading and trailing whitespace of the final line is trimmed.
func (f Feed) String() string {
	tags := ""
	if len(f.Tags) > 0 {
		tags = " " + strings.Join(f.Tags, " ")
	}

	if f.Description != "" {
		return strings.TrimSpace(fmt.Sprintf("%s %s # %s", f.URL, f.Description, tags))
	}
	return strings.TrimSpace(fmt.Sprintf("%s %s", f.URL, tags))
}


// Token is a single lexical unit produced by the Lexer: a type label
// (for example "URL", "DESC", "TAG", "HEADER", "OBRACKET", "CBRACKET" or
// "WHITESPACE") and the text it was built from.
type Token struct {
	Type  string
	Value string
}

// String formats the token as "<Type> : '<Value>'".
func (t Token) String() string {
	return fmt.Sprintf("%s : '%s'", t.Type, t.Value)
}

// Lexer embeds a temporary buffer that accumulates "content"
// (url, description, tags) and the list of tokens produced so far.
type Lexer struct {
	Buf    string
	Tokens []Token
}

// AddToken appends a token of type t with value s to the token list.
func (l *Lexer) AddToken(t string, s string) {
	l.Tokens = append(l.Tokens, Token{Type: t, Value: s})
}

// LastToken returns the most recently added token, which is useful to
// determine the context of a symbol. It returns the zero Token when nothing
// has been tokenized yet.
func (l Lexer) LastToken() Token {
	if len(l.Tokens) == 0 {
		return Token{}
	}
	return l.Tokens[len(l.Tokens)-1]
}

// CountToken returns how many tokens have a Value equal to t.
func (l Lexer) CountToken(t string) int {
	var counter int
	for i := range l.Tokens {
		if l.Tokens[i].Value == t {
			counter++
		}
	}
	return counter
}

// insideBrackets reports whether more "[" than "]" have been tokenized,
// i.e. whether the lexer is currently reading bracketed content.
func (l Lexer) insideBrackets() bool {
	return l.CountToken("[") > l.CountToken("]")
}

// flushBuf turns the buffered text, if any, into a token typed by
// IdentifyContent and empties the buffer. Surrounding whitespace is trimmed
// and a buffer that is empty after trimming produces no token.
func (l *Lexer) flushBuf() {
	if content := strings.TrimSpace(l.Buf); content != "" {
		l.AddToken(l.IdentifyContent(), content)
	}
	l.Buf = ""
}

// IdentifyContent returns the token type for the content currently being
// read, based on the brackets tokenized so far:
//
//   - "URL"  while a bracket is open and two "[" have been seen ("[[url");
//   - "DESC" while a bracket is open and three "[" have been seen ("[[url][desc");
//   - "TAG"  once at least one bracket was opened and all of them are closed;
//   - ""     in every other situation (e.g. text before the link).
func (l Lexer) IdentifyContent() string {
	opened := l.CountToken("[")
	closed := l.CountToken("]")

	switch {
	case opened > closed && opened == 2:
		return "URL"
	case opened > closed && opened == 3:
		return "DESC"
	case opened > 0 && opened == closed:
		return "TAG"
	}
	return ""
}

// Process tokenizes a given line s from the org file and returns the
// resulting tokens.
//
// A line without any "[" is treated as whitespace-separated words: the first
// word becomes a URL token and every following word a TAG token. Otherwise
// the line is scanned rune by rune as an org link ("[[url][description]]"),
// optionally preceded by "*" header markers and followed by tags.
//
// Process has a value receiver: it works on a copy of the lexer, so the
// caller's Buf and Tokens fields are left untouched and the result is only
// available through the returned slice.
func (l Lexer) Process(s string) []Token {
	if !strings.Contains(s, "[") {
		// Fields skips runs of whitespace, so repeated separators never
		// produce empty URL or TAG tokens.
		for i, word := range strings.Fields(s) {
			if i == 0 {
				l.AddToken("URL", word)
				continue
			}
			l.AddToken("TAG", word)
		}
		return l.Tokens
	}

	// Ranging over the string yields whole runes, so multi-byte UTF-8
	// characters are kept intact in the buffered content.
	for i, r := range s {
		char := string(r)
		switch {
		case r == '[':
			// A word pending in the buffer ends where a bracket opens.
			l.flushBuf()
			l.AddToken("OBRACKET", char)

		case r == ']':
			// The closing bracket ends the current content. Its type must be
			// identified before CBRACKET is recorded, because recording it
			// changes the open/closed bracket balance. The CBRACKET token is
			// emitted first, followed by the content token.
			tokenType := l.IdentifyContent()
			content := strings.TrimSpace(l.Buf)
			l.Buf = ""
			l.AddToken("CBRACKET", char)
			if content != "" {
				l.AddToken(tokenType, content)
			}

		case r == ' ' && l.insideBrackets():
			// Inside brackets a space is part of the content. Leading spaces
			// are dropped and runs of spaces collapse into one. s[i-1] is a
			// valid byte test because a space is a single-byte character.
			precededBySpace := i > 0 && s[i-1] == ' '
			if l.Buf != "" && !precededBySpace {
				l.Buf += char
			}

		case r == ' ':
			// Outside brackets a space is a separator: it ends the pending
			// word and yields at most one WHITESPACE token per run.
			l.flushBuf()
			if l.LastToken().Type != "WHITESPACE" {
				l.AddToken("WHITESPACE", char)
			}

		case r == '*' && l.Buf == "" && !l.insideBrackets():
			// Only a free-standing asterisk is a header marker; one inside a
			// URL, description or word is ordinary content.
			l.AddToken("HEADER", char)

		default:
			l.Buf += char
		}
	}

	// Whatever is left in the buffer at end of line is the last content item.
	l.flushBuf()

	return l.Tokens
}

// Parse builds a Feed from a token stream. Only content tokens are used:
// "URL" sets the feed URL, "DESC" sets its description and every "TAG" is
// appended to its tags. Separators, brackets and any other token type are
// ignored. When several URL or DESC tokens are present, the last one wins.
func Parse(t []Token) Feed {
	var feed Feed
	for _, tok := range t {
		switch tok.Type {
		case "URL":
			feed.URL = tok.Value
		case "DESC":
			feed.Description = tok.Value
		case "TAG":
			feed.Tags = append(feed.Tags, tok.Value)
		}
	}
	return feed
}