Недялко обнови решението на 03.12.2013 04:49 (преди над 4 години)
+package main
+
+import (
+ //"fmt"
+ //"io/ioutil"
+ "regexp"
+ "strconv"
+)
+
+// A stateless parser implementation that's pretty unoptimized but somewhat flexible
+// NOTE: there are probably some off-by-one errors; too sleepy to test, though...
+
+// MarkdownParser extracts headers, names, phone numbers, links and e-mail
+// addresses from a Markdown document. Its only field is the document text;
+// every query re-scans that text.
+type MarkdownParser struct {
+ // text is the document text handed to NewMarkdownParser.
+ text string
+}
+
+// Element records where a regex match was found inside the parser text.
+// All positions are byte offsets into the full text, not into the substring
+// that was searched.
+type Element struct {
+	// ElementStart is the offset at which the whole match begins.
+	ElementStart int
+	// ElementEnd is the offset just past the end of the whole match.
+	ElementEnd int
+	// TextStart is the offset at which the first capture group begins.
+	TextStart int
+	// TextEnd is the offset just past the end of the first capture group.
+	TextEnd int
+	// MatchedPatterns is the full pattern set that was searched when this
+	// element was found (not only the single pattern that matched).
+	MatchedPatterns []*regexp.Regexp
+}
+
+func getHeaderRegexPatterns(level int) []*regexp.Regexp {
+ //TODO: directly cache the compiled regexes: it's a bit wasteful to compute the them every time...
+ if level < 1 || level > 6 {
+ panic("OMG")
+ }
+
+ patterns := make([]*regexp.Regexp, 2)
+ patterns[0] = regexp.MustCompile(`(?m)^#{` + strconv.Itoa(level) + `}[ \t]*([^#].*?)[ \t]*#*\n+`)
+ if level > 2 {
+ return patterns[:1]
+ }
+
+ if level == 1 {
+ patterns[1] = regexp.MustCompile(`(?m)^(.+)[ \t]*\n=+[ \t]*\n+`)
+ } else if level == 2 {
+ patterns[1] = regexp.MustCompile(`(?m)^(.+)[ \t]*\n-+[ \t]*\n+`)
+ }
+
+ return patterns
+}
+
+// getText returns the part of the parser text between the byte offsets from
+// (inclusive) and to (exclusive). A negative to, or one past the end of the
+// text, means "up to the end of the text". An empty string is returned when
+// from lies beyond the effective end.
+func (mp *MarkdownParser) getText(from, to int) string {
+	end := to
+	if limit := len(mp.text); end < 0 || end > limit {
+		end = limit
+	}
+
+	if from <= end {
+		return mp.text[from:end]
+	}
+	return ""
+}
+
+// getFirstElement searches the text range [from, to) with every pattern and
+// reports the match that starts earliest; on a tie the pattern listed first
+// wins. The boolean result is false when no pattern matches. Offsets in the
+// returned Element are relative to the whole text.
+//
+// It panics when a pattern does not have exactly one capture group.
+func (mp *MarkdownParser) getFirstElement(patterns []*regexp.Regexp, from, to int) (bool, Element) {
+	var best []int
+	for _, pattern := range patterns {
+		if pattern.NumSubexp() != 1 {
+			panic("Invalid regex was supplied")
+		}
+
+		loc := pattern.FindStringSubmatchIndex(mp.getText(from, to))
+		switch {
+		case len(best) == 0:
+			// Nothing found so far: take whatever this pattern produced,
+			// even if that is "no match".
+			best = loc
+		case len(loc) > 0 && loc[0] < best[0]:
+			best = loc
+		}
+	}
+
+	if len(best) == 0 {
+		return false, Element{}
+	}
+
+	// The indices are relative to the searched substring; shift them back.
+	return true, Element{
+		ElementStart:    best[0] + from,
+		ElementEnd:      best[1] + from,
+		TextStart:       best[2] + from,
+		TextEnd:         best[3] + from,
+		MatchedPatterns: patterns,
+	}
+}
+
+func (mp *MarkdownParser) findFirstElementByContent(patterns []*regexp.Regexp, content string, from, to int) (bool, Element) {
+ var currentFrom int = from
+
+ for {
+ found, elPos := mp.getFirstElement(patterns, currentFrom, to)
+ if !found {
+ return false, Element{}
+ }
+
+ if mp.getText(elPos.TextStart, elPos.TextEnd) == content {
+ return true, elPos
+ }
+
+ currentFrom = elPos.ElementEnd
+ }
+}
+
+// getElementsText collects, in document order, the captured text of every
+// element matched by patterns inside the range [from, to). It returns nil
+// when nothing matches.
+func (mp *MarkdownParser) getElementsText(patterns []*regexp.Regexp, from, to int) []string {
+	var texts []string
+
+	for cursor := from; ; {
+		ok, el := mp.getFirstElement(patterns, cursor, to)
+		if !ok {
+			return texts
+		}
+
+		texts = append(texts, mp.getText(el.TextStart, el.TextEnd))
+		cursor = el.ElementEnd
+	}
+}
+
+// getMatchedText compiles pattern, finds all non-overlapping matches in the
+// parser text and returns, for each match, the text of the submatch with
+// index matchPosition (0 is the whole match, 1 the first capture group, and
+// so on).
+//
+// It returns nil when there are no matches or when matchPosition does not
+// refer to an existing submatch. It panics when pattern is not a valid
+// regular expression.
+func (mp *MarkdownParser) getMatchedText(pattern string, matchPosition int) []string {
+	re := regexp.MustCompile(pattern)
+
+	// Every match has exactly NumSubexp()+1 entries, so the position can be
+	// validated once. This also covers matchPosition == len(match), which
+	// used to slip through the bounds check and panic.
+	if matchPosition < 0 || matchPosition > re.NumSubexp() {
+		return nil
+	}
+
+	var result []string
+	for _, match := range re.FindAllStringSubmatch(mp.text, -1) {
+		result = append(result, match[matchPosition])
+	}
+
+	return result
+}
+
+// getChildrenOf returns the elements matched by patterns that follow el.
+// The search starts at the end of el and stops at the next element matching
+// el.MatchedPatterns (its next sibling). When el ends at offset 0, as the
+// zero Element does, or has no following sibling, the search runs to the end
+// of the text.
+//
+// NOTE(review): the sibling lookup itself always scans to the end of the
+// text, so the range is not limited by where el's own parent ends.
+func (mp *MarkdownParser) getChildrenOf(el Element, patterns []*regexp.Regexp) []Element {
+	// -1 means "until the end of the text".
+	limit := -1
+	if el.ElementEnd > 0 {
+		if ok, sibling := mp.getFirstElement(el.MatchedPatterns, el.ElementEnd, -1); ok {
+			limit = sibling.ElementStart
+		}
+	}
+
+	var children []Element
+	for cursor := el.ElementEnd; ; {
+		ok, child := mp.getFirstElement(patterns, cursor, limit)
+		if !ok {
+			return children
+		}
+
+		children = append(children, child)
+		cursor = child.ElementEnd
+	}
+}
+
+// getContentsOf renders the table-of-contents lines for the headers of the
+// given level found below el, followed recursively by the lines of their
+// own sub-headers down to level 6. Each line has the form
+// "<prefix>.<n> <header text>\n", where n counts the headers from 1.
+func (mp *MarkdownParser) getContentsOf(el Element, prefix string, level int) string {
+	toc := ""
+
+	children := mp.getChildrenOf(el, getHeaderRegexPatterns(level))
+	for idx := range children {
+		child := children[idx]
+		number := prefix + "." + strconv.Itoa(idx+1)
+		toc += number + " " + mp.getText(child.TextStart, child.TextEnd) + "\n"
+
+		// Level 6 is the deepest header level; nothing to descend into.
+		if level >= 6 {
+			continue
+		}
+		toc += mp.getContentsOf(child, number, level+1)
+	}
+
+	return toc
+}
+
+// The required methods:
+
+// NewMarkdownParser returns a parser for the given Markdown text.
+func NewMarkdownParser(text string) *MarkdownParser {
+	return &MarkdownParser{text: text}
+}
+
+// Headers returns the text of every level 1 header in document order.
+func (mp *MarkdownParser) Headers() []string {
+	h1Patterns := getHeaderRegexPatterns(1)
+	return mp.getElementsText(h1Patterns, 0, -1)
+}
+
+// SubHeadersOf returns the text of the level 2 headers that belong to the
+// first level 1 header whose text equals header. It returns nil when no such
+// level 1 header exists or when it has no level 2 headers.
+func (mp *MarkdownParser) SubHeadersOf(header string) []string {
+	ok, parent := mp.findFirstElementByContent(getHeaderRegexPatterns(1), header, 0, -1)
+	if !ok {
+		return nil
+	}
+
+	var titles []string
+	children := mp.getChildrenOf(parent, getHeaderRegexPatterns(2))
+	for idx := range children {
+		titles = append(titles, mp.getText(children[idx].TextStart, children[idx].TextEnd))
+	}
+
+	return titles
+}
+
+func (mp *MarkdownParser) Names() []string {
+ return mp.getMatchedText(`[^\.\n"'\s][ \t]*(\p{Lu}\p{Ll}*([ -]+\p{Lu}\p{Ll}*)+)`, 1)
+}
+
+// PhoneNumbers returns every phone-number-like sequence in the text: an
+// optional leading '+' or '(', then digits that may be mixed with
+// parentheses, dashes and spaces, always ending in a digit.
+//
+// A number is recognized at the very start of the text or right after a
+// character that is not a digit, letter, word character or '-'.
+func (mp *MarkdownParser) PhoneNumbers() []string {
+	// The `^` alternative lets a number at offset 0 match; without it the
+	// pattern always needed a preceding character.
+	return mp.getMatchedText(`(?:^|[^\d\pL\w-])([+\(]?\d[\d\(\)\- ]*\d)`, 1)
+}
+
+func (mp *MarkdownParser) Links() []string {
+ return mp.getMatchedText(`(https?:\/\/[^\s]+)`, 1)
+}
+// Emails returns every e-mail address in the text. An address consists of a
+// local part that starts with a letter or digit (followed by up to 200
+// letters, digits, '_', '+', '.' or '-'), an '@', one or more dot-terminated
+// domain labels and a top-level domain of 2 to 6 letters.
+//
+// An address is recognized at the very start of the text or right after a
+// whitespace character.
+func (mp *MarkdownParser) Emails() []string {
+	// Inside a character class `^` is a literal caret, not an anchor, so the
+	// previous `[\s\n^]` prefix never matched an address at offset 0. The
+	// anchor is now a real alternative; the literal caret is still accepted
+	// so that no previously matched input is lost.
+	return mp.getMatchedText(`(?:^|[\s^])([a-zA-Z0-9][a-zA-Z0-9_\+\.\-]{0,200}@(?:[a-zA-Z0-9]+(?:\-*[a-zA-Z0-9])*\.)+[a-zA-Z]{2,6})`, 1)
+}
+
+// GenerateTableOfContents renders a numbered table of contents for the whole
+// document. Level 1 headers are written as "<n>. <text>\n"; the lines for
+// deeper headers come from getContentsOf and carry dotted numbers without a
+// trailing dot, which is why the top level is handled separately here.
+func (mp *MarkdownParser) GenerateTableOfContents() string {
+	toc := ""
+
+	// The zero Element stands for the document root.
+	topLevel := mp.getChildrenOf(Element{}, getHeaderRegexPatterns(1))
+	for idx := range topLevel {
+		h1 := topLevel[idx]
+		number := strconv.Itoa(idx + 1)
+		toc += number + ". " + mp.getText(h1.TextStart, h1.TextEnd) + "\n"
+		toc += mp.getContentsOf(h1, number, 2)
+	}
+
+	return toc
+}
+
+/*
+func main() {
+ content, err := ioutil.ReadFile("./README.md")
+ if err != nil {
+ return
+ }
+
+ mdp := NewMarkdownParser(string(content))
+
+ //fmt.Printf("mdp.Headers(): %v\n\n\n", mdp.Headers())
+ //fmt.Printf("mdp.SubHeadersOf('MarkdownParser'): %v\n\n\n", mdp.SubHeadersOf("MarkdownParser"))
+ //fmt.Println(mdp.GenerateTableOfContents())
+
+ //fmt.Printf("mdp.Names(): %#v\n\n\n", mdp.Names())
+ //fmt.Printf("mdp.PhoneNumbers(): %#v\n\n\n", mdp.PhoneNumbers())
+ //fmt.Printf("mdp.PhoneNumbers(): %#v\n\n\n", mdp.PhoneNumbers())
+ //fmt.Printf("mdp.Links(): %#v\n\n\n", mdp.Links())
+ fmt.Printf("mdp.Emails(): %#v\n\n\n", mdp.Emails())
+
+ return
+}
+*/