// Source: org2newsboat/tokenizer.go (2021-03-04 17:04:17 +01:00)
// 131 lines, 2.5 KiB, Go.
// NOTE: this header was reconstructed from file-browser residue that
// preceded the package clause and made the file uncompilable.
package main
import (
"fmt"
"regexp"
"strings"
)
// Feed holds a single subscription parsed from the org file: its URL
// plus an optional description and a list of tags.
type Feed struct {
	URL         string   // feed address
	Description string   // optional human-readable description
	Tags        []string // newsboat tags attached to the feed
}

// String renders the feed as a newsboat urls-file line: the URL,
// followed by the space-joined tags, and — when a description is
// present — a trailing " # description" suffix.
func (f Feed) String() string {
	tagPart := strings.TrimSpace(strings.Join(f.Tags, " "))
	out := fmt.Sprintf("%s %s", f.URL, tagPart)
	if f.Description != "" {
		out = fmt.Sprintf("%s # %s", strings.TrimSpace(out), f.Description)
	}
	return strings.TrimSpace(out)
}
// Token is one lexical unit produced while scanning an org line: a
// Type tag ("URL", "DESC", "TAG") and the text it covers.
type Token struct {
	Type  string // token category
	Value string // token text
}

// String formats the token as `TYPE : 'value'` for debugging output.
func (t Token) String() string {
	return t.Type + " : '" + t.Value + "'"
}
// Lexer embeds a temporary buffer to store "content"
// (url, description, tags) and an array of tokens.
type Lexer struct {
	// Buf is the scratch content buffer. None of the methods visible
	// in this file touch it — presumably used elsewhere; TODO confirm.
	Buf string
	// Tokens holds every token emitted so far, in input order.
	Tokens []Token
}
// AddToken appends a token of type t with value s to the stream.
func (l *Lexer) AddToken(t string, s string) {
	tok := Token{Type: t, Value: s}
	l.Tokens = append(l.Tokens, tok)
}
// LastToken returns the last tokenized item, useful to determine
// context for a symbol. The zero Token is returned when nothing has
// been tokenized yet.
func (l Lexer) LastToken() Token {
	if n := len(l.Tokens); n > 0 {
		return l.Tokens[n-1]
	}
	return Token{}
}
// CountToken returns how many tokens in the stream have a Value equal
// to t.
//
// NOTE(review): the parameter name t mirrors AddToken's "type"
// argument, yet the comparison here is against Token.Value, not
// Token.Type — confirm with callers whether Value is really intended.
func (l Lexer) CountToken(t string) int {
	counter := 0
	for i := range l.Tokens {
		if l.Tokens[i].Value == t {
			counter++
		}
	}
	return counter
}
// ProcessSimpleLink tokenizes a line holding a bare (non-bracket) org
// link: everything after the 3-character org prefix is split on
// spaces, the first word becomes the URL token and each remaining
// non-empty word becomes a TAG token with its ':' delimiters removed.
func (l *Lexer) ProcessSimpleLink(s string) {
	// Guard: lines shorter than the org prefix carry no link content;
	// the previous code sliced s[3:] unconditionally and panicked.
	if len(s) < 3 {
		return
	}
	words := strings.Split(s[3:], " ")
	for i, w := range words {
		if i == 0 {
			l.AddToken("URL", strings.TrimSpace(w))
			continue
		}
		// strings.Split on " " can never yield a " " element, so the
		// old `!= " "` check was dead and has been dropped.
		if w != "" {
			l.AddToken("TAG", strings.ReplaceAll(w, ":", ""))
		}
	}
}
// bracketLinkRe matches org bracket links of the form
// [[url][description]], optionally followed by trailing tag text.
// Compiled once at package scope instead of on every Process call.
var bracketLinkRe = regexp.MustCompile(`(?:\[\[)(?P<url>\S+)(?:\]\[)(?P<desc>.+)(?:\]\])(?P<tags>.+)?`)

// Process tokenizes a given line s from the org file and returns the
// tokens gathered during the call. A line whose 4th byte is not '['
// is handed to ProcessSimpleLink; otherwise the bracket-link regexp
// extracts the URL, the description, and any trailing ':'-delimited
// tags.
//
// NOTE(review): the value receiver means tokens are appended to a
// per-call copy of the Lexer and are only visible through the
// returned slice — confirm no caller expects l.Tokens itself to grow.
func (l Lexer) Process(s string) []Token {
	// Guard: the code below indexes s[3]; short lines used to panic.
	if len(s) < 4 {
		return l.Tokens
	}
	if s[3] != '[' {
		l.ProcessSimpleLink(s)
		return l.Tokens
	}
	matches := bracketLinkRe.FindStringSubmatch(s)
	if len(matches) > 1 {
		l.AddToken("URL", strings.TrimSpace(matches[1]))
	}
	if len(matches) > 2 {
		l.AddToken("DESC", strings.TrimSpace(matches[2]))
	}
	if len(matches) > 3 {
		// The tags group is optional; an unmatched group yields "",
		// which splits to [""] and is filtered out below.
		for _, raw := range strings.Split(matches[3], " ") {
			if raw != "" {
				tag := strings.ReplaceAll(raw, ":", "")
				l.AddToken("TAG", strings.TrimSpace(tag))
			}
		}
	}
	return l.Tokens
}
// Parse folds a token stream into a Feed: URL and DESC tokens set the
// corresponding fields (last one wins) and every TAG token is
// appended to Tags.
func Parse(t []Token) Feed {
	var feed Feed
	for _, tok := range t {
		switch tok.Type {
		case "URL":
			feed.URL = tok.Value
		case "DESC":
			feed.Description = tok.Value
		case "TAG":
			feed.Tags = append(feed.Tags, tok.Value)
		}
	}
	return feed
}