Решение на Markdown от Александър Димитров

Обратно към всички решения

Към профила на Александър Димитров

Резултати

  • 4 точки от тестове
  • 0 бонус точки
  • 4 точки общо
  • 3 успешни тест(а)
  • 4 неуспешни тест(а)

Код

package main
import (
"regexp"
"strconv"
)
type MarkdownParser struct {
content string
}
func NewMarkdownParser(text string) *MarkdownParser {
var mp *MarkdownParser = new(MarkdownParser)
mp.content = text
return mp
}
func (mp *MarkdownParser) Headers() []string {
result := []string{}
headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
headerSubmatch := headerRegex.FindAllStringSubmatch(mp.content, -1)
for index := range headerSubmatch { //since we need only the header name and not the whole thing, use capturing groups
if headerSubmatch[index][2] != "" {
result = append(result, headerSubmatch[index][2])
} else if headerSubmatch[index][7] != "" {
result = append(result, headerSubmatch[index][7])
} else {
result = append(result, headerSubmatch[index][8])
}
}
return result
}
func (mp *MarkdownParser) SubHeadersOf(header string) []string {
result := []string{}
headers := mp.Headers()
var headerIndex int
for index := range headers {
if headers[index] == header {
headerIndex = index
}
}
headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
headersText := headerRegex.Split(mp.content, -1) //split the text according to headers
subHeaderRegex := regexp.MustCompile(`(?m)(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
subHeaderSubmatch := subHeaderRegex.FindAllStringSubmatch(headersText[headerIndex+1], -1) //find all the subHeaders of the given header
for index := range subHeaderSubmatch { //since we need only the subHeader name and not the whole thing, use capturing groups
if subHeaderSubmatch[index][2] != "" {
result = append(result, subHeaderSubmatch[index][2])
} else if subHeaderSubmatch[index][7] != "" {
result = append(result, subHeaderSubmatch[index][7])
} else {
result = append(result, subHeaderSubmatch[index][8])
}
}
return result
}
func (mp *MarkdownParser) Names() []string {
result := []string{}
headersRegexString := `(?m)(^([^=\s].+)$\n^=+$)|(^([^-\s].+)$\n^-+$)`
for _, headerIndex := range []int{1, 2, 3, 4, 5, 6} {
headersRegexString = headersRegexString + `|(^`
for i := 0; i < headerIndex; i++ {
headersRegexString = headersRegexString + `#`
}
headersRegexString = headersRegexString + `([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`
}
namesRegex := regexp.MustCompile(`[^\p{Lu}\p{Ll}]((\p{Lu}\p{Ll}*)(((\s?\-\s?)|\s)(\p{Lu}\p{Ll}*))+)`) //Find all names in a sentence
headersRegex := regexp.MustCompile(headersRegexString) //Remove all headers as we are searching for names in the text
sentenceRegex := regexp.MustCompile(`(?s)[\p{Lu}\d]([\p{Lu}\p{Ll}\s\d]+|(["-\-/:->@[-a{-~]+[\p{Lu}\p{Ll}\s\d]+["-\-/:->@[-a{-~]*)|([\.\?!]+[\p{Lu}\p{Ll}\d]+))+[\.!\?]\s`) //Get all sentences from the text
textParts := headersRegex.Split(mp.content, -1)
for _, text := range textParts {
sentences := sentenceRegex.FindAllString(text+". ", -1) //close the text as a sentence if there isn't a dot, as it won't continue from one header to another
for _, sentence := range sentences {
names := namesRegex.FindAllStringSubmatch(sentence, -1)
for _, nameSubmatch := range names { //since we don't need the first found symbol and only the name, use capturing groups
result = append(result, nameSubmatch[1])
}
}
}
return result
}
func (mp *MarkdownParser) PhoneNumbers() []string {
result := []string{}
phoneRegex := regexp.MustCompile(`\s(\+?(([\t ]?\(((\d[\d\t \-]*\d)|(\d\d)|(\d))\)[\t ]?)|([\t ]?((\d[\d\t \-]*\d)|(\d\d)|(\d))[\t ]?))+)\s`)
phoneSubmatches := phoneRegex.FindAllStringSubmatch(mp.content, -1)
for _, phoneSubmatch := range phoneSubmatches { //since we need only the phone number without the surrounding spaces, use capturing groups
result = append(result, phoneSubmatch[1])
}
return result
}
func (mp *MarkdownParser) Links() []string {
result := []string{}
linkRegex := regexp.MustCompile(`(?i)\[.+\]\(([-\da-z\.]+://((([a-z0-9][-a-z0-9]*?[a-z0-9]*)(\.([a-z0-9][-a-z0-9]*?[a-z0-9]*))*)|(\[[0-9a-f:]+\])|([0-9\.]+))(:\d+)?/(([-a-z0-9_\.~%]/?)*)?(\?(([-a-z0-9_]+=[-a-z0-9_]+)(&([-a-z0-9_]+=[-a-z0-9_]+))*))?(#[-a-z0-9_]+)?)\)`)
linkSubmatches := linkRegex.FindAllStringSubmatch(mp.content, -1)
for _, linkSubmatch := range linkSubmatches { //since we need only the link without the surrounding brackets and text, use capturing groups
result = append(result, linkSubmatch[1])
}
return result
}
func (mp *MarkdownParser) Emails() []string {
result := []string{}
emailRegex := regexp.MustCompile(`(?s)[\s,]([a-zA-Z0-9][-a-zA-Z0-9_\+\.]{0,200}@((([a-z0-9][-a-z0-9]*?[a-z0-9]*)(\.([a-z0-9][-a-z0-9]*?[a-z0-9]*))*)|(\[[0-9a-f:]+\])|([0-9\.]+)))`)
emailSubmatches := emailRegex.FindAllStringSubmatch(mp.content, -1)
for _, emailSubmatch := range emailSubmatches { //since we need only the email without the surrounding spaces, use capturing groups
result = append(result, emailSubmatch[1])
}
return result
}
func subHeadersOf(text string, level int, prefix string) string {
if level > 6 {
return ""
}
result := ""
regexString := `(?m)(^##`
for index := 3; index <= level; index++ {
regexString = regexString + `#`
}
regexString = regexString + `([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`
subHeaderRegex := regexp.MustCompile(regexString)
subHeadersSubmatch := subHeaderRegex.FindAllStringSubmatch(text, -1)
subHeaders := []string{}
for index := range subHeadersSubmatch { //since we need only the subHeader name and not the whole thing, use capturing groups
if subHeadersSubmatch[index][5] != "" {
subHeaders = append(subHeaders, subHeadersSubmatch[index][5])
} else {
subHeaders = append(subHeaders, subHeadersSubmatch[index][6])
}
}
subHeaderTexts := subHeaderRegex.Split(text, -1)
subHeaderTexts = subHeaderTexts[1:]
for index := range subHeaderTexts {
result = result + prefix + "." + strconv.Itoa(index+1) + " " + subHeaders[index] + "\n"
result = result + subHeadersOf(subHeaderTexts[index], level+1, prefix+"."+strconv.Itoa(index+1))
}
return result
}
func (mp *MarkdownParser) GenerateTableOfContents() string {
result := ""
subHeaderRegex := regexp.MustCompile(`(?m)(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#[^#](((.+?)#+)|(.+))$)`)
headers := mp.Headers()
for headerIndex, header := range headers {
result = result + strconv.Itoa(headerIndex+1) + ". " + header + "\n"
headersText := headerRegex.Split(mp.content, -1)
headersText = headersText[1:]
subHeaders := mp.SubHeadersOf(header)
subHeadersText := subHeaderRegex.Split(headersText[headerIndex], -1)
for subHeaderIndex, subHeader := range subHeaders {
result = result + strconv.Itoa(headerIndex+1) + "." + strconv.Itoa(subHeaderIndex+1) + " " + subHeader + "\n"
result = result + subHeadersOf(subHeadersText[subHeaderIndex+1], 3, strconv.Itoa(headerIndex+1)+"."+strconv.Itoa(subHeaderIndex+1))
}
}
return result
}

Лог от изпълнението

PASS
ok  	_/tmp/d20140106-32701-1tgj8vm	0.012s
--- FAIL: TestSubHeadersOf (0.00 seconds)
	solution_test.go:56: Not equal:
		  []string{}
		  []string(nil)
FAIL
exit status 1
FAIL	_/tmp/d20140106-32701-1tgj8vm	0.013s
--- FAIL: TestNames (0.00 seconds)
	solution_test.go:72: Not equal:
		  []string{"Of Line", "Иван Петров"}
		  []string{"Of Line", "Иван Петров", "Mozilla Firefox"}
FAIL
exit status 1
FAIL	_/tmp/d20140106-32701-1tgj8vm	0.015s
--- FAIL: TestPhoneNumbers (0.00 seconds)
	solution_test.go:86: Not equal:
		  []string{"0889123456", "0 (889) 123"}
		  []string{"0889123456", "0 (889) 123", "456", "+45-(31)"}
FAIL
exit status 1
FAIL	_/tmp/d20140106-32701-1tgj8vm	0.012s
--- FAIL: TestLinks (0.00 seconds)
	solution_test.go:98: Not equal:
		  []string{}
		  []string{"http://somelink.com:230", "https://www.google.bg/search?q=4531&ie=utf-8&oe=utf-8&rls=org.mozilla:en-US:official&client=%20firefox-a&gws_rd=asd&ei=some#somefragment"}
FAIL
exit status 1
FAIL	_/tmp/d20140106-32701-1tgj8vm	0.012s
PASS
ok  	_/tmp/d20140106-32701-1tgj8vm	0.013s
PASS
ok  	_/tmp/d20140106-32701-1tgj8vm	0.015s

История (3 версии и 2 коментара)

Александър обнови решението на 29.11.2013 18:17 (преди над 4 години)

+package main
+
+import "regexp"
+import "strconv"
+
+type MarkdownParser struct {
+ content string
+}
+
+func NewMarkdownParser(text string) *MarkdownParser {
+ var mp *MarkdownParser = new(MarkdownParser)
+ mp.content = text
+ return mp
+}
+
+func (mp *MarkdownParser) Headers() []string {
+ result := []string{}
+
+ headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
+ headerSubmatch := headerRegex.FindAllStringSubmatch(mp.content, -1)
+
+ for index := range headerSubmatch { //since we need only the header name and not the whole thing, use capturing groups
+ if headerSubmatch[index][2] != "" {
+ result = append(result, headerSubmatch[index][2])
+ } else if headerSubmatch[index][7] != "" {
+ result = append(result, headerSubmatch[index][7])
+ } else {
+ result = append(result, headerSubmatch[index][8])
+ }
+ }
+
+ return result
+}
+
+func (mp *MarkdownParser) SubHeadersOf(header string) []string {
+ result := []string{}
+
+ headers := mp.Headers()
+ var headerIndex int
+ for index := range headers {
+ if headers[index] == header {
+ headerIndex = index
+ }
+ }
+
+ headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
+ headersText := headerRegex.Split(mp.content, -1) //split the text according to headers
+
+ subHeaderRegex := regexp.MustCompile(`(?m)(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
+ subHeaderSubmatch := subHeaderRegex.FindAllStringSubmatch(headersText[headerIndex+1], -1) //find all the subHeaders of the given header
+
+ for index := range subHeaderSubmatch { //since we need only the subHeader name and not the whole thing, use capturing groups
+ if subHeaderSubmatch[index][2] != "" {
+ result = append(result, subHeaderSubmatch[index][2])
+ } else if subHeaderSubmatch[index][7] != "" {
+ result = append(result, subHeaderSubmatch[index][7])
+ } else {
+ result = append(result, subHeaderSubmatch[index][8])
+ }
+ }
+
+ return result
+}
+
+func (mp *MarkdownParser) Names() []string {
+ result := []string{}
+
+ namesRegex := regexp.MustCompile(`[^\p{Lu}\p{Ll}]((\p{Lu}\p{Ll}*)(((\s?\-\s?)|\s)(\p{Lu}\p{Ll}*))+)`) //Find all names in a sentence
+ headersRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^###([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^####([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^#####([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^######([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`) //Remove all headers as we are searching for names in the text
+ sentenceRegex := regexp.MustCompile(`(?s)[\p{Lu}\d]([\p{Lu}\p{Ll}\s\d]+|(["-\-/:->@[-a{-~]+[\p{Lu}\p{Ll}\s\d]+["-\-/:->@[-a{-~]*)|([\.\?!]+[\p{Lu}\p{Ll}\d]+))+[\.!\?]\s`) //Get all sentences from the text
+
+ textParts := headersRegex.Split(mp.content, -1)
+ for _, text := range textParts {
+ sentences := sentenceRegex.FindAllString(text+". ", -1) //close the text as a sentence if there isn't a dot, as it won't continue from one header to another
+ for _, sentence := range sentences {
+ names := namesRegex.FindAllStringSubmatch(sentence, -1)
+ for _, nameSubmatch := range names { //since we don't need the first found symbol and only the name, use capturing groups
+ result = append(result, nameSubmatch[1])
+ }
+ }
+ }
+
+ return result
+}
+
+func (mp *MarkdownParser) PhoneNumbers() []string {
+ result := []string{}
+
+ phoneRegex := regexp.MustCompile(`\s(\+?(([\t ]?\(((\d[\d\t \-]*\d)|(\d\d)|(\d))\)[\t ]?)|([\t ]?((\d[\d\t \-]*\d)|(\d\d)|(\d))[\t ]?))+)\s`)
+ phoneSubmatches := phoneRegex.FindAllStringSubmatch(mp.content, -1)
+
+ for _, phoneSubmatch := range phoneSubmatches { //since we need only the phone number without the surrounding spaces, use capturing groups
+ result = append(result, phoneSubmatch[1])
+ }
+
+ return result
+}
+
+func (mp *MarkdownParser) Links() []string {
+ result := []string{}
+
+ linkRegex := regexp.MustCompile(`(?i)\[.+\]\(([-\da-z\.]+://((([a-z0-9][-a-z0-9]*?[a-z0-9]*)(\.([a-z0-9][-a-z0-9]*?[a-z0-9]*))*)|(\[[0-9a-f:]+\])|([0-9\.]+))(:\d+)?/(([-a-z0-9_\.~%]/?)*)?(\?(([-a-z0-9_]=[-a-z0-9_])(&([-a-z0-9_]))*))?(#[-a-z0-9_]+)?)\)`)
+ linkSubmatches := linkRegex.FindAllStringSubmatch(mp.content, -1)
+
+ for _, linkSubmatch := range linkSubmatches { //since we need only the link without the surrounding brackets and text, use capturing groups
+ result = append(result, linkSubmatch[1])
+ }
+
+ return result
+}
+
+func (mp *MarkdownParser) Emails() []string {
+ emailRegex := regexp.MustCompile(`[a-zA-Z0-9][-a-zA-Z0-9_\+\.]{0,200}@((([a-z0-9][-a-z0-9]*?[a-z0-9]*)(\.([a-z0-9][-a-z0-9]*?[a-z0-9]*))*)|(\[[0-9a-f:]+\])|([0-9\.]+))`)
+
+ return emailRegex.FindAllString(mp.content, -1)
+}
+
+func subHeadersOf(text string, level int, prefix string) string {
+ if level > 6 {
+ return ""
+ }
+
+ result := ""
+
+ regexString := `(?m)(^##`
+ for index := 3; index <= level; index++ {
+ regexString = regexString + `#`
+ }
+ regexString = regexString + `([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`
+ subHeaderRegex := regexp.MustCompile(regexString)
+
+ subHeadersSubmatch := subHeaderRegex.FindAllStringSubmatch(text, -1)
+ subHeaders := []string{}
+
+ for index := range subHeadersSubmatch { //since we need only the subHeader name and not the whole thing, use capturing groups
+ if subHeadersSubmatch[index][5] != "" {
+ subHeaders = append(subHeaders, subHeadersSubmatch[index][5])
+ } else {
+ subHeaders = append(subHeaders, subHeadersSubmatch[index][6])
+ }
+ }
+
+ subHeaderTexts := subHeaderRegex.Split(text, -1)
+ subHeaderTexts = subHeaderTexts[1:]
+
+ for index := range subHeaderTexts {
+ result = result + prefix + "." + strconv.Itoa(index+1) + " " + subHeaders[index] + "\n"
+ result = result + subHeadersOf(subHeaderTexts[index], level+1, prefix+"."+strconv.Itoa(index+1))
+ }
+
+ return result
+}
+
+func (mp *MarkdownParser) GenerateTableOfContents() string {
+ result := ""
+
+ subHeaderRegex := regexp.MustCompile(`(?m)(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
+ headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#[^#](((.+?)#+)|(.+))$)`)
+ headers := mp.Headers()
+
+ for headerIndex, header := range headers {
+ result = result + strconv.Itoa(headerIndex+1) + ". " + header + "\n"
+
+ headersText := headerRegex.Split(mp.content, -1)
+ headersText = headersText[1:]
+
+ subHeaders := mp.SubHeadersOf(header)
+ subHeadersText := subHeaderRegex.Split(headersText[headerIndex], -1)
+
+ for subHeaderIndex, subHeader := range subHeaders {
+ result = result + strconv.Itoa(headerIndex+1) + "." + strconv.Itoa(subHeaderIndex+1) + " " + subHeader + "\n"
+ result = result + subHeadersOf(subHeadersText[subHeaderIndex+1], 3, strconv.Itoa(headerIndex+1)+"."+strconv.Itoa(subHeaderIndex+1))
+ }
+ }
+
+ return result
+}

Александър обнови решението на 30.11.2013 10:21 (преди над 4 години)

package main
import "regexp"
import "strconv"
type MarkdownParser struct {
content string
}
func NewMarkdownParser(text string) *MarkdownParser {
var mp *MarkdownParser = new(MarkdownParser)
mp.content = text
return mp
}
func (mp *MarkdownParser) Headers() []string {
result := []string{}
headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
headerSubmatch := headerRegex.FindAllStringSubmatch(mp.content, -1)
for index := range headerSubmatch { //since we need only the header name and not the whole thing, use capturing groups
if headerSubmatch[index][2] != "" {
result = append(result, headerSubmatch[index][2])
} else if headerSubmatch[index][7] != "" {
result = append(result, headerSubmatch[index][7])
} else {
result = append(result, headerSubmatch[index][8])
}
}
return result
}
func (mp *MarkdownParser) SubHeadersOf(header string) []string {
result := []string{}
headers := mp.Headers()
var headerIndex int
for index := range headers {
if headers[index] == header {
headerIndex = index
}
}
headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
headersText := headerRegex.Split(mp.content, -1) //split the text according to headers
subHeaderRegex := regexp.MustCompile(`(?m)(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
subHeaderSubmatch := subHeaderRegex.FindAllStringSubmatch(headersText[headerIndex+1], -1) //find all the subHeaders of the given header
for index := range subHeaderSubmatch { //since we need only the subHeader name and not the whole thing, use capturing groups
if subHeaderSubmatch[index][2] != "" {
result = append(result, subHeaderSubmatch[index][2])
} else if subHeaderSubmatch[index][7] != "" {
result = append(result, subHeaderSubmatch[index][7])
} else {
result = append(result, subHeaderSubmatch[index][8])
}
}
return result
}
func (mp *MarkdownParser) Names() []string {
result := []string{}
namesRegex := regexp.MustCompile(`[^\p{Lu}\p{Ll}]((\p{Lu}\p{Ll}*)(((\s?\-\s?)|\s)(\p{Lu}\p{Ll}*))+)`) //Find all names in a sentence
headersRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^###([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^####([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^#####([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^######([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`) //Remove all headers as we are searching for names in the text
sentenceRegex := regexp.MustCompile(`(?s)[\p{Lu}\d]([\p{Lu}\p{Ll}\s\d]+|(["-\-/:->@[-a{-~]+[\p{Lu}\p{Ll}\s\d]+["-\-/:->@[-a{-~]*)|([\.\?!]+[\p{Lu}\p{Ll}\d]+))+[\.!\?]\s`) //Get all sentences from the text
textParts := headersRegex.Split(mp.content, -1)
for _, text := range textParts {
sentences := sentenceRegex.FindAllString(text+". ", -1) //close the text as a sentence if there isn't a dot, as it won't continue from one header to another
for _, sentence := range sentences {
names := namesRegex.FindAllStringSubmatch(sentence, -1)
for _, nameSubmatch := range names { //since we don't need the first found symbol and only the name, use capturing groups
result = append(result, nameSubmatch[1])
}
}
}
return result
}
func (mp *MarkdownParser) PhoneNumbers() []string {
result := []string{}
phoneRegex := regexp.MustCompile(`\s(\+?(([\t ]?\(((\d[\d\t \-]*\d)|(\d\d)|(\d))\)[\t ]?)|([\t ]?((\d[\d\t \-]*\d)|(\d\d)|(\d))[\t ]?))+)\s`)
phoneSubmatches := phoneRegex.FindAllStringSubmatch(mp.content, -1)
for _, phoneSubmatch := range phoneSubmatches { //since we need only the phone number without the surrounding spaces, use capturing groups
result = append(result, phoneSubmatch[1])
}
return result
}
func (mp *MarkdownParser) Links() []string {
result := []string{}
- linkRegex := regexp.MustCompile(`(?i)\[.+\]\(([-\da-z\.]+://((([a-z0-9][-a-z0-9]*?[a-z0-9]*)(\.([a-z0-9][-a-z0-9]*?[a-z0-9]*))*)|(\[[0-9a-f:]+\])|([0-9\.]+))(:\d+)?/(([-a-z0-9_\.~%]/?)*)?(\?(([-a-z0-9_]=[-a-z0-9_])(&([-a-z0-9_]))*))?(#[-a-z0-9_]+)?)\)`)
+ linkRegex := regexp.MustCompile(`(?i)\[.+\]\(([-\da-z\.]+://((([a-z0-9][-a-z0-9]*?[a-z0-9]*)(\.([a-z0-9][-a-z0-9]*?[a-z0-9]*))*)|(\[[0-9a-f:]+\])|([0-9\.]+))(:\d+)?/(([-a-z0-9_\.~%]/?)*)?(\?(([-a-z0-9_]+=[-a-z0-9_]+)(&([-a-z0-9_]+=[-a-z0-9_]+))*))?(#[-a-z0-9_]+)?)\)`)
linkSubmatches := linkRegex.FindAllStringSubmatch(mp.content, -1)
for _, linkSubmatch := range linkSubmatches { //since we need only the link without the surrounding brackets and text, use capturing groups
result = append(result, linkSubmatch[1])
}
return result
}
func (mp *MarkdownParser) Emails() []string {
- emailRegex := regexp.MustCompile(`[a-zA-Z0-9][-a-zA-Z0-9_\+\.]{0,200}@((([a-z0-9][-a-z0-9]*?[a-z0-9]*)(\.([a-z0-9][-a-z0-9]*?[a-z0-9]*))*)|(\[[0-9a-f:]+\])|([0-9\.]+))`)
+ result := []string{}
- return emailRegex.FindAllString(mp.content, -1)
+ emailRegex := regexp.MustCompile(`(?s)[\s,]([a-zA-Z0-9][-a-zA-Z0-9_\+\.]{0,200}@((([a-z0-9][-a-z0-9]*?[a-z0-9]*)(\.([a-z0-9][-a-z0-9]*?[a-z0-9]*))*)|(\[[0-9a-f:]+\])|([0-9\.]+)))`)
+ emailSubmatches := emailRegex.FindAllStringSubmatch(mp.content, -1)
+
+ for _, emailSubmatch := range emailSubmatches { //since we need only the email without the surrounding spaces, use capturing groups
+ result = append(result, emailSubmatch[1])
+ }
+
+ return result
}
func subHeadersOf(text string, level int, prefix string) string {
if level > 6 {
return ""
}
result := ""
regexString := `(?m)(^##`
for index := 3; index <= level; index++ {
regexString = regexString + `#`
}
regexString = regexString + `([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`
subHeaderRegex := regexp.MustCompile(regexString)
subHeadersSubmatch := subHeaderRegex.FindAllStringSubmatch(text, -1)
subHeaders := []string{}
for index := range subHeadersSubmatch { //since we need only the subHeader name and not the whole thing, use capturing groups
if subHeadersSubmatch[index][5] != "" {
subHeaders = append(subHeaders, subHeadersSubmatch[index][5])
} else {
subHeaders = append(subHeaders, subHeadersSubmatch[index][6])
}
}
subHeaderTexts := subHeaderRegex.Split(text, -1)
subHeaderTexts = subHeaderTexts[1:]
for index := range subHeaderTexts {
result = result + prefix + "." + strconv.Itoa(index+1) + " " + subHeaders[index] + "\n"
result = result + subHeadersOf(subHeaderTexts[index], level+1, prefix+"."+strconv.Itoa(index+1))
}
return result
}
func (mp *MarkdownParser) GenerateTableOfContents() string {
result := ""
subHeaderRegex := regexp.MustCompile(`(?m)(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#[^#](((.+?)#+)|(.+))$)`)
headers := mp.Headers()
for headerIndex, header := range headers {
result = result + strconv.Itoa(headerIndex+1) + ". " + header + "\n"
headersText := headerRegex.Split(mp.content, -1)
headersText = headersText[1:]
subHeaders := mp.SubHeadersOf(header)
subHeadersText := subHeaderRegex.Split(headersText[headerIndex], -1)
for subHeaderIndex, subHeader := range subHeaders {
result = result + strconv.Itoa(headerIndex+1) + "." + strconv.Itoa(subHeaderIndex+1) + " " + subHeader + "\n"
result = result + subHeadersOf(subHeadersText[subHeaderIndex+1], 3, strconv.Itoa(headerIndex+1)+"."+strconv.Itoa(subHeaderIndex+1))
}
}
return result
}

На това домашно, незнайно защо, бях забил да го пиша 5 дена подред, мъчейки се да докарам имената като хората и в логичен път xD

Принципно реших, че тъй като условието на задачата няма да се променя, мога да направя груб copy & paste на регулярните изрази за H3 до H6, просто за да свърша със задачата, иначе и един for не би навредил, което ще кача след малко :)

Александър обнови решението на 02.12.2013 16:42 (преди над 4 години)

package main
-import "regexp"
-import "strconv"
+import (
+ "regexp"
+ "strconv"
+)
type MarkdownParser struct {
content string
}
func NewMarkdownParser(text string) *MarkdownParser {
var mp *MarkdownParser = new(MarkdownParser)
mp.content = text
return mp
}
func (mp *MarkdownParser) Headers() []string {
result := []string{}
headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
headerSubmatch := headerRegex.FindAllStringSubmatch(mp.content, -1)
for index := range headerSubmatch { //since we need only the header name and not the whole thing, use capturing groups
if headerSubmatch[index][2] != "" {
result = append(result, headerSubmatch[index][2])
} else if headerSubmatch[index][7] != "" {
result = append(result, headerSubmatch[index][7])
} else {
result = append(result, headerSubmatch[index][8])
}
}
return result
}
func (mp *MarkdownParser) SubHeadersOf(header string) []string {
result := []string{}
headers := mp.Headers()
var headerIndex int
for index := range headers {
if headers[index] == header {
headerIndex = index
}
}
headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
headersText := headerRegex.Split(mp.content, -1) //split the text according to headers
subHeaderRegex := regexp.MustCompile(`(?m)(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
subHeaderSubmatch := subHeaderRegex.FindAllStringSubmatch(headersText[headerIndex+1], -1) //find all the subHeaders of the given header
for index := range subHeaderSubmatch { //since we need only the subHeader name and not the whole thing, use capturing groups
if subHeaderSubmatch[index][2] != "" {
result = append(result, subHeaderSubmatch[index][2])
} else if subHeaderSubmatch[index][7] != "" {
result = append(result, subHeaderSubmatch[index][7])
} else {
result = append(result, subHeaderSubmatch[index][8])
}
}
return result
}
func (mp *MarkdownParser) Names() []string {
result := []string{}
- namesRegex := regexp.MustCompile(`[^\p{Lu}\p{Ll}]((\p{Lu}\p{Ll}*)(((\s?\-\s?)|\s)(\p{Lu}\p{Ll}*))+)`) //Find all names in a sentence
- headersRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^###([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^####([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^#####([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)|(^######([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`) //Remove all headers as we are searching for names in the text
- sentenceRegex := regexp.MustCompile(`(?s)[\p{Lu}\d]([\p{Lu}\p{Ll}\s\d]+|(["-\-/:->@[-a{-~]+[\p{Lu}\p{Ll}\s\d]+["-\-/:->@[-a{-~]*)|([\.\?!]+[\p{Lu}\p{Ll}\d]+))+[\.!\?]\s`) //Get all sentences from the text
+ headersRegexString := `(?m)(^([^=\s].+)$\n^=+$)|(^([^-\s].+)$\n^-+$)`
+ for _, headerIndex := range []int{1, 2, 3, 4, 5, 6} {
+ headersRegexString = headersRegexString + `|(^`
+ for i := 0; i < headerIndex; i++ {
+ headersRegexString = headersRegexString + `#`
+ }
+ headersRegexString = headersRegexString + `([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`
+ }
+
+ namesRegex := regexp.MustCompile(`[^\p{Lu}\p{Ll}]((\p{Lu}\p{Ll}*)(((\s?\-\s?)|\s)(\p{Lu}\p{Ll}*))+)`) //Find all names in a sentence
+ headersRegex := regexp.MustCompile(headersRegexString) //Remove all headers as we are searching for names in the text
+ sentenceRegex := regexp.MustCompile(`(?s)[\p{Lu}\d]([\p{Lu}\p{Ll}\s\d]+|(["-\-/:->@[-a{-~]+[\p{Lu}\p{Ll}\s\d]+["-\-/:->@[-a{-~]*)|([\.\?!]+[\p{Lu}\p{Ll}\d]+))+[\.!\?]\s`) //Get all sentences from the text
textParts := headersRegex.Split(mp.content, -1)
for _, text := range textParts {
sentences := sentenceRegex.FindAllString(text+". ", -1) //close the text as a sentence if there isn't a dot, as it won't continue from one header to another
for _, sentence := range sentences {
names := namesRegex.FindAllStringSubmatch(sentence, -1)
for _, nameSubmatch := range names { //since we don't need the first found symbol and only the name, use capturing groups
result = append(result, nameSubmatch[1])
}
}
}
return result
}
func (mp *MarkdownParser) PhoneNumbers() []string {
result := []string{}
phoneRegex := regexp.MustCompile(`\s(\+?(([\t ]?\(((\d[\d\t \-]*\d)|(\d\d)|(\d))\)[\t ]?)|([\t ]?((\d[\d\t \-]*\d)|(\d\d)|(\d))[\t ]?))+)\s`)
phoneSubmatches := phoneRegex.FindAllStringSubmatch(mp.content, -1)
for _, phoneSubmatch := range phoneSubmatches { //since we need only the phone number without the surrounding spaces, use capturing groups
result = append(result, phoneSubmatch[1])
}
return result
}
func (mp *MarkdownParser) Links() []string {
result := []string{}
linkRegex := regexp.MustCompile(`(?i)\[.+\]\(([-\da-z\.]+://((([a-z0-9][-a-z0-9]*?[a-z0-9]*)(\.([a-z0-9][-a-z0-9]*?[a-z0-9]*))*)|(\[[0-9a-f:]+\])|([0-9\.]+))(:\d+)?/(([-a-z0-9_\.~%]/?)*)?(\?(([-a-z0-9_]+=[-a-z0-9_]+)(&([-a-z0-9_]+=[-a-z0-9_]+))*))?(#[-a-z0-9_]+)?)\)`)
linkSubmatches := linkRegex.FindAllStringSubmatch(mp.content, -1)
for _, linkSubmatch := range linkSubmatches { //since we need only the link without the surrounding brackets and text, use capturing groups
result = append(result, linkSubmatch[1])
}
return result
}
func (mp *MarkdownParser) Emails() []string {
result := []string{}
emailRegex := regexp.MustCompile(`(?s)[\s,]([a-zA-Z0-9][-a-zA-Z0-9_\+\.]{0,200}@((([a-z0-9][-a-z0-9]*?[a-z0-9]*)(\.([a-z0-9][-a-z0-9]*?[a-z0-9]*))*)|(\[[0-9a-f:]+\])|([0-9\.]+)))`)
emailSubmatches := emailRegex.FindAllStringSubmatch(mp.content, -1)
for _, emailSubmatch := range emailSubmatches { //since we need only the email without the surrounding spaces, use capturing groups
result = append(result, emailSubmatch[1])
}
return result
}
func subHeadersOf(text string, level int, prefix string) string {
if level > 6 {
return ""
}
result := ""
regexString := `(?m)(^##`
for index := 3; index <= level; index++ {
regexString = regexString + `#`
}
regexString = regexString + `([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`
subHeaderRegex := regexp.MustCompile(regexString)
subHeadersSubmatch := subHeaderRegex.FindAllStringSubmatch(text, -1)
subHeaders := []string{}
for index := range subHeadersSubmatch { //since we need only the subHeader name and not the whole thing, use capturing groups
if subHeadersSubmatch[index][5] != "" {
subHeaders = append(subHeaders, subHeadersSubmatch[index][5])
} else {
subHeaders = append(subHeaders, subHeadersSubmatch[index][6])
}
}
subHeaderTexts := subHeaderRegex.Split(text, -1)
subHeaderTexts = subHeaderTexts[1:]
for index := range subHeaderTexts {
result = result + prefix + "." + strconv.Itoa(index+1) + " " + subHeaders[index] + "\n"
result = result + subHeadersOf(subHeaderTexts[index], level+1, prefix+"."+strconv.Itoa(index+1))
}
return result
}
func (mp *MarkdownParser) GenerateTableOfContents() string {
result := ""
subHeaderRegex := regexp.MustCompile(`(?m)(^([^-\s].+)$\n^-+$)|(^##([\t ]*((([^#\n].+?)#+)|([^#\n].+)))$)`)
headerRegex := regexp.MustCompile(`(?m)(^([^=\s].+)$\n^=+$)|(^#[^#](((.+?)#+)|(.+))$)`)
headers := mp.Headers()
for headerIndex, header := range headers {
result = result + strconv.Itoa(headerIndex+1) + ". " + header + "\n"
headersText := headerRegex.Split(mp.content, -1)
headersText = headersText[1:]
subHeaders := mp.SubHeadersOf(header)
subHeadersText := subHeaderRegex.Split(headersText[headerIndex], -1)
for subHeaderIndex, subHeader := range subHeaders {
result = result + strconv.Itoa(headerIndex+1) + "." + strconv.Itoa(subHeaderIndex+1) + " " + subHeader + "\n"
result = result + subHeadersOf(subHeadersText[subHeaderIndex+1], 3, strconv.Itoa(headerIndex+1)+"."+strconv.Itoa(subHeaderIndex+1))
}
}
return result
}