Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ require (

require (
github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dlclark/regexp2 v1.7.0 // indirect
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-playground/locales v0.14.0 // indirect
Expand All @@ -25,11 +26,14 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pelletier/go-toml/v2 v2.0.6 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/stretchr/testify v1.9.0 // indirect
github.com/ugorji/go/codec v1.2.8 // indirect
golang.org/x/crypto v0.4.0 // indirect
golang.org/x/net v0.7.0 // indirect
golang.org/x/sys v0.5.0 // indirect
golang.org/x/text v0.7.0 // indirect
google.golang.org/protobuf v1.28.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/ugorji/go/codec v1.2.8 h1:sgBJS6COt0b/P40VouWKdseidkDgHxYGm0SAglUHfP0=
github.com/ugorji/go/codec v1.2.8/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
golang.org/x/crypto v0.0.0-20211215153901-e495a2d5b3d3/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
Expand Down
11 changes: 2 additions & 9 deletions src/routes/question.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ func extractQuestionData(doc *goquery.Document, domain string) (question types.F
if err != nil {
return question, err
}
question.Body = template.HTML(processHTMLBody(questionBodyParentHTML))
question.Body = template.HTML(utils.ProcessHTMLBody(questionBodyParentHTML))

// Extract the shortened body description.
shortenedBody := strings.TrimSpace(questionBodyParent.Text())
Expand Down Expand Up @@ -245,7 +245,7 @@ func extractAnswersData(doc *goquery.Document, domain string) ([]types.FilteredA
answerBodyHTML, _ := answerBody.Html()

// Process code blocks within the answer.
processedAnswerBody := processHTMLBody(answerBodyHTML)
processedAnswerBody := utils.ProcessHTMLBody(answerBodyHTML)
answer.Body = template.HTML(html.UnescapeString(processedAnswerBody))

// Extract author information and timestamp.
Expand All @@ -257,13 +257,6 @@ func extractAnswersData(doc *goquery.Document, domain string) ([]types.FilteredA
return answers, nil
}

// processHTMLBody passes bodyHTML through utils.HighlightCodeBlocks and then
// utils.ReplaceImgTags, returning the final result.
func processHTMLBody(bodyHTML string) string {
	return utils.ReplaceImgTags(utils.HighlightCodeBlocks(bodyHTML))
}

// extractAnswerAuthorInfo extracts the author name, URL, and timestamp from an answer block.
// It directly mutates the answer.
func extractAnswerAuthorInfo(selection *goquery.Selection, answer *types.FilteredAnswer, domain string) {
Expand Down
2 changes: 1 addition & 1 deletion src/utils/comments.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func FindAndReturnComments(inHtml, domain string, postLayout *goquery.Selection)
commentTimestamp := commentBody.Find("span.relativetime-clean").Text()

newFilteredComment := types.FilteredComment{
Text: template.HTML(commentCopy),
Text: template.HTML(ProcessHTMLBody(commentCopy)),
Timestamp: commentTimestamp,
AuthorName: commentAuthor.Text(),
AuthorURL: commentAuthorURL,
Expand Down
45 changes: 45 additions & 0 deletions src/utils/links.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package utils

import (
"net/url"
"regexp"
"strings"
)

// stackOverflowLinkQualifierRegex matches complete anchor elements (opening
// tag through the closing </a>) that meet the following conditions:
//   - the href has a pathname beginning with /q/ or /questions/
//   - if the href has a host, it must be stackoverflow.com, stackexchange.com,
//     or a subdomain of either
//
// The (?s) flag lets the anchor's contents span multiple lines.
var stackOverflowLinkQualifierRegex = regexp.MustCompile(`(?s)<a\s[^>]*href="(?:https?://(?:www\.)?(?:\w+\.)*(?:stackoverflow|stackexchange)\.com)?/(?:q|questions)/[^"]*"[^>]*>.*?</a>`)

// hrefAttributeRegex captures the value of a double-quoted href attribute.
// It is compiled once here rather than on every matched anchor.
var hrefAttributeRegex = regexp.MustCompile(`href="([^"]*)"`)

// ReplaceStackOverflowLinks rewrites the href of every anchor in html that
// matches stackOverflowLinkQualifierRegex into a host-relative path.
//
// The scheme and host are dropped. When the host has a subdomain (for example
// security.stackexchange.com) the path is prefixed with /exchange/<subdomain>;
// a leading "www." is not treated as a subdomain. The query string and
// fragment are preserved together with their "?" and "#" separators.
//
// Anchors without a double-quoted href, or whose href cannot be parsed as a
// URL, are returned unchanged.
func ReplaceStackOverflowLinks(html string) string {
	return stackOverflowLinkQualifierRegex.ReplaceAllStringFunc(html, func(match string) string {
		// Extract the href attribute from the anchor tag.
		hrefMatch := hrefAttributeRegex.FindStringSubmatch(match)
		if len(hrefMatch) < 2 {
			return match
		}

		parsed, err := url.Parse(hrefMatch[1])
		if err != nil {
			return match
		}

		var rewritten strings.Builder

		// Anything left of the registrable domain (other than "www") is a
		// subdomain and is carried over into the local path.
		host := strings.TrimPrefix(parsed.Host, "www.")
		if labels := strings.Split(host, "."); len(labels) > 2 {
			rewritten.WriteString("/exchange/" + labels[0])
		}

		// Use the escaped forms so percent-encoding in the original link
		// survives the round trip through url.Parse.
		rewritten.WriteString(parsed.EscapedPath())
		if parsed.RawQuery != "" {
			rewritten.WriteString("?" + parsed.RawQuery)
		}
		if parsed.Fragment != "" {
			rewritten.WriteString("#" + parsed.EscapedFragment())
		}

		// Replace the whole attribute (not just its value) so an identical
		// string elsewhere in the tag cannot be substituted by mistake.
		return strings.Replace(match, hrefMatch[0], `href="`+rewritten.String()+`"`, 1)
	})
}
49 changes: 49 additions & 0 deletions src/utils/links_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package utils

import (
"fmt"
"github.com/stretchr/testify/assert"
"strings"
"testing"
)

// sampleInput is an HTML fragment shaped like a "This question already has
// answers here" notice. It contains three anchors: one with a relative
// /questions/ href, one with an absolute stackoverflow.com href, and one with
// an absolute security.stackexchange.com href.
var sampleInput = `<div class="d-flex fd-column fw-nowrap">
<div class="d-flex fw-nowrap">
<div class="flex--item wmn0 fl1 lh-lg">
<div class="flex--item fl1 lh-lg">
<div>
<b>This question already has answers here</b>:

</div>
</div>
</div>
</div>
<div class="flex--item mb0 mt4">
<a href="/questions/55083952/is-it-possible-to-populate-a-large-set-at-compile-time" dir="ltr">Is it possible to populate a large set at compile time?</a>
<span class="question-originals-answer-count">
(3 answers)
</span>
</div>
<div class="flex--item mb0 mt4">
<a href="https://stackoverflow.com/questions/27221504/how-can-you-make-a-safe-static-singleton-in-rust" dir="ltr">How can you make a safe static singleton in Rust?</a>
<span class="question-originals-answer-count">
(5 answers)
</span>
</div>
<div class="flex--item mb0 mt4">
<a href="https://security.stackexchange.com/questions/25371/brute-force-an-ssh-login-that-has-only-a-4-letter-password" dir="ltr">Brute-force an SSH-login that has only a 4-letter password</a>
<span class="question-originals-answer-count">
(9 answers)
</span>
</div>
<div class="flex--item mb0 mt8">Closed <span title="2020-01-29 14:28:42Z" class="relativetime">4 years ago</span>.</div>
</div>`

// TestReplaceStackOverflowLinks checks that ReplaceStackOverflowLinks removes
// every Stack Overflow / Stack Exchange host from sampleInput and that each of
// the three links is rewritten to the expected local path. Asserting only on
// the absence of hosts would also pass if the links were dropped or mangled.
func TestReplaceStackOverflowLinks(t *testing.T) {
	replacedLinks := ReplaceStackOverflowLinks(sampleInput)

	// No absolute Stack Overflow / Stack Exchange host may survive.
	for _, host := range []string{"stackoverflow.com", "stackexchange.com"} {
		assert.False(t, strings.Contains(replacedLinks, host),
			fmt.Sprintf("output still references %s", host))
	}

	// Each link must be present in its rewritten, host-relative form.
	expectedHrefs := []string{
		"/questions/55083952/is-it-possible-to-populate-a-large-set-at-compile-time",
		"/questions/27221504/how-can-you-make-a-safe-static-singleton-in-rust",
		"/exchange/security/questions/25371/brute-force-an-ssh-login-that-has-only-a-4-letter-password",
	}
	for _, href := range expectedHrefs {
		attr := fmt.Sprintf(`href="%s"`, href)
		assert.True(t, strings.Contains(replacedLinks, attr),
			fmt.Sprintf("output is missing %s", attr))
	}
}
9 changes: 9 additions & 0 deletions src/utils/process.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package utils

// ProcessHTMLBody runs HTML through the various preparation functions.
func ProcessHTMLBody(bodyHTML string) string {
highlightedBody := HighlightCodeBlocks(bodyHTML)
imageProxiedBody := ReplaceImgTags(highlightedBody)
stackOverflowLinksReplacedBody := ReplaceStackOverflowLinks(imageProxiedBody)
return stackOverflowLinksReplacedBody
}