Александър обнови решението на 29.11.2013 18:17 (преди над 4 години)
+package main
+
+import "regexp"
+import "strconv"
+
// MarkdownParser answers queries (headers, names, phone numbers, links,
// e-mail addresses, table of contents) about a single Markdown document.
type MarkdownParser struct {
	content string // the raw Markdown text given to NewMarkdownParser
}

// NewMarkdownParser returns a parser over text. The text is stored unchanged;
// all matching is done by the individual query methods.
func NewMarkdownParser(text string) *MarkdownParser {
	return &MarkdownParser{content: text}
}
+
+func (mp *MarkdownParser) Headers() []string {
+ result := []string{}
+
+ headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
+ headerSubmatch := headerRegex.FindAllStringSubmatch(mp.content, -1)
+
+ for index := range headerSubmatch { //since we need only the header name and not the whole thing, use capturing groups
+ if headerSubmatch[index][2] != "" {
+ result = append(result, headerSubmatch[index][2])
+ } else if headerSubmatch[index][7] != "" {
+ result = append(result, headerSubmatch[index][7])
+ } else {
+ result = append(result, headerSubmatch[index][8])
+ }
+ }
+
+ return result
+}
+
+func (mp *MarkdownParser) SubHeadersOf(header string) []string {
+ result := []string{}
+
+ headers := mp.Headers()
+ var headerIndex int
+ for index := range headers {
+ if headers[index] == header {
+ headerIndex = index
+ }
+ }
+
+ headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
+ headersText := headerRegex.Split(mp.content, -1) //split the text according to headers
+
+ subHeaderRegex := regexp.MustCompile(`(?m)(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
+ subHeaderSubmatch := subHeaderRegex.FindAllStringSubmatch(headersText[headerIndex+1], -1) //find all the subHeaders of the given header
+
+ for index := range subHeaderSubmatch { //since we need only the subHeader name and not the whole thing, use capturing groups
+ if subHeaderSubmatch[index][2] != "" {
+ result = append(result, subHeaderSubmatch[index][2])
+ } else if subHeaderSubmatch[index][7] != "" {
+ result = append(result, subHeaderSubmatch[index][7])
+ } else {
+ result = append(result, subHeaderSubmatch[index][8])
+ }
+ }
+
+ return result
+}
+
+func (mp *MarkdownParser) Names() []string {
+ result := []string{}
+
+ namesRegex := regexp.MustCompile(`[^\p{Lu}\p{Ll}]((\p{Lu}\p{Ll}*)(((\s?\-\s?)|\s)(\p{Lu}\p{Ll}*))+)`) //Find all names in a sentence
+ headersRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^###([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^####([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^#####([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^######([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`) //Remove all headers as we are searching for names in the text
+ sentenceRegex := regexp.MustCompile(`(?s)[\p{Lu}\d]([\p{Lu}\p{Ll}\s\d]+|(["-\-/:->@[-a{-~]+[\p{Lu}\p{Ll}\s\d]+["-\-/:->@[-a{-~]*)|([\.\?!]+[\p{Lu}\p{Ll}\d]+))+[\.!\?]\s`) //Get all sentences from the text
+
+ textParts := headersRegex.Split(mp.content, -1)
+ for _, text := range textParts {
+ sentences := sentenceRegex.FindAllString(text+". ", -1) //close the text as a sentence if there isn't a dot, as it won't continue from one header to another
+ for _, sentence := range sentences {
+ names := namesRegex.FindAllStringSubmatch(sentence, -1)
+ for _, nameSubmatch := range names { //since we don't need the first found symbol and only the name, use capturing groups
+ result = append(result, nameSubmatch[1])
+ }
+ }
+ }
+
+ return result
+}
+
+func (mp *MarkdownParser) PhoneNumbers() []string {
+ result := []string{}
+
+ phoneRegex := regexp.MustCompile(`\s(\+?(([\t ]?\(((\d[\d\t \-]*\d)|(\d\d)|(\d))\)[\t ]?)|([\t ]?((\d[\d\t \-]*\d)|(\d\d)|(\d))[\t ]?))+)\s`)
+ phoneSubmatches := phoneRegex.FindAllStringSubmatch(mp.content, -1)
+
+ for _, phoneSubmatch := range phoneSubmatches { //since we need only the phone number without the surrounding spaces, use capturing groups
+ result = append(result, phoneSubmatch[1])
+ }
+
+ return result
+}
+
+func (mp *MarkdownParser) Links() []string {
+ result := []string{}
+
+ linkRegex := regexp.MustCompile(`(?i)\[.+\]\(([-\da-z\.]+://((([a-z0-9][-a-z0-9]*?[a-z0-9]*)(\.([a-z0-9][-a-z0-9]*?[a-z0-9]*))*)|(\[[0-9a-f:]+\])|([0-9\.]+))(:\d+)?/(([-a-z0-9_\.~%]/?)*)?(\?(([-a-z0-9_]=[-a-z0-9_])(&([-a-z0-9_]))*))?(#[-a-z0-9_]+)?)\)`)
+ linkSubmatches := linkRegex.FindAllStringSubmatch(mp.content, -1)
+
+ for _, linkSubmatch := range linkSubmatches { //since we need only the link without the surrounding brackets and text, use capturing groups
+ result = append(result, linkSubmatch[1])
+ }
+
+ return result
+}
+
+func (mp *MarkdownParser) Emails() []string {
+ emailRegex := regexp.MustCompile(`[a-zA-Z0-9][-a-zA-Z0-9_\+\.]{0,200}@((([a-z0-9][-a-z0-9]*?[a-z0-9]*)(\.([a-z0-9][-a-z0-9]*?[a-z0-9]*))*)|(\[[0-9a-f:]+\])|([0-9\.]+))`)
+
+ return emailRegex.FindAllString(mp.content, -1)
+}
+
// subHeadersOf renders the table-of-contents lines for the ATX headers of the
// given level found in text, followed recursively by the lines for deeper
// levels found under each of them.
//
// level is the number of '#' that open a header line; levels below 2 are
// treated as 2 and levels above 6 produce no output. prefix is the numbering
// of the enclosing header: the i-th header found is numbered "prefix.i". Each
// output line is "<number> <title>\n".
func subHeadersOf(text string, level int, prefix string) string {
	if level > 6 {
		return ""
	}

	// One '#' per level, never fewer than two.
	marker := "##"
	for depth := 3; depth <= level; depth++ {
		marker += "#"
	}
	// Group 5 is the title when the line has a closing run of '#', group 6
	// when it has none.
	pattern := regexp.MustCompile(`(?m)(^` + marker + `([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)

	matches := pattern.FindAllStringSubmatch(text, -1)
	// Piece 0 is the text before the first header of this level; body i
	// belongs to matches[i].
	bodies := pattern.Split(text, -1)[1:]

	toc := ""
	for i, body := range bodies {
		title := matches[i][5]
		if title == "" {
			title = matches[i][6]
		}
		number := prefix + "." + strconv.Itoa(i+1)
		toc += number + " " + title + "\n"
		toc += subHeadersOf(body, level+1, number)
	}

	return toc
}
+
+func (mp *MarkdownParser) GenerateTableOfContents() string {
+ result := ""
+
+ subHeaderRegex := regexp.MustCompile(`(?m)(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
+ headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#[^#](((.+?)#+)|(.+))$)`)
+ headers := mp.Headers()
+
+ for headerIndex, header := range headers {
+ result = result + strconv.Itoa(headerIndex+1) + ". " + header + "\n"
+
+ headersText := headerRegex.Split(mp.content, -1)
+ headersText = headersText[1:]
+
+ subHeaders := mp.SubHeadersOf(header)
+ subHeadersText := subHeaderRegex.Split(headersText[headerIndex], -1)
+
+ for subHeaderIndex, subHeader := range subHeaders {
+ result = result + strconv.Itoa(headerIndex+1) + "." + strconv.Itoa(subHeaderIndex+1) + " " + subHeader + "\n"
+ result = result + subHeadersOf(subHeadersText[subHeaderIndex+1], 3, strconv.Itoa(headerIndex+1)+"."+strconv.Itoa(subHeaderIndex+1))
+ }
+ }
+
+ return result
+}