-
Notifications
You must be signed in to change notification settings - Fork 29
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: word count #118
fix: word count #118
Changes from all commits
eed7998
95d0838
f41393a
3ac1081
cf5ecb6
83b507c
2d89d4e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
import { Logs } from "@ubiquity-dao/ubiquibot-logger"; | ||
import configuration from "../configuration/config-reader"; | ||
|
||
const logger = new Logs("debug"); | ||
const logger = new Logs(configuration.logLevel); | ||
|
||
export default logger; |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,8 @@ export class DataPurgeModule implements Module { | |
.replace(/^>.*$/gm, "") | ||
// Remove commands such as /start | ||
.replace(/^\/.+/g, "") | ||
// Remove HTML comments | ||
.replace(/<!--[\s\S]*?-->/g, "") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If we use a virtual DOM creator like There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Body comes in a text form. Then it gets transformed into MD -> HTML. So it is rendered as a text form. So typically when fetched from GitHub the body looks like There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. HTML comments shouldn't be included in element.textContent is my point. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, but there is no way to know it is a comment before converting it to HTML, since the MD renderer is run first (it is actually the same for v1) so it is first converted to a |
||
// Keep only one new line needed by markdown-it package to convert to html | ||
.replace(/\n\s*\n/g, "\n") | ||
.trim(); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -108,7 +108,9 @@ export class FormattingEvaluatorModule implements Module { | |
} | ||
|
||
_getFormattingScore(comment: GithubCommentScore) { | ||
const html = this._md.render(comment.content); | ||
// Change the \r to \n to fix markup interpretation | ||
const html = this._md.render(comment.content.replaceAll("\r", "\n")); | ||
logger.debug("Will analyze formatting for the current content", { comment: comment.content, html }); | ||
const temp = new JSDOM(html); | ||
if (temp.window.document.body) { | ||
const res = this.classifyTagsWithWordCount(temp.window.document.body, comment.type); | ||
|
@@ -118,7 +120,7 @@ export class FormattingEvaluatorModule implements Module { | |
} | ||
} | ||
|
||
_countWords(regexes: FormattingEvaluatorConfiguration["multipliers"][0]["rewards"]["regex"], text: string) { | ||
_countSymbols(regexes: FormattingEvaluatorConfiguration["multipliers"][0]["rewards"]["regex"], text: string) { | ||
const counts: { [p: string]: { count: number; multiplier: number } } = {}; | ||
for (const [regex, multiplier] of Object.entries(regexes)) { | ||
const match = text.trim().match(new RegExp(regex, "g")); | ||
|
@@ -139,17 +141,36 @@ export class FormattingEvaluatorModule implements Module { | |
|
||
for (const element of elements) { | ||
const tagName = element.tagName.toLowerCase(); | ||
const wordCount = this._countWords(this._multipliers[commentType].regex, element.textContent || ""); | ||
// We cannot use textContent otherwise we would duplicate counts, so instead we extract text nodes | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. textContent of the top level parent element will do the right thing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct, will be part of #92 |
||
const textNodes = Array.from(element?.childNodes || []).filter((node) => node.nodeType === 3); | ||
const innerText = textNodes | ||
.map((node) => node.nodeValue?.trim()) | ||
.join(" ") | ||
.trim(); | ||
const symbols = this._countSymbols(this._multipliers[commentType].regex, innerText); | ||
let score = 0; | ||
if (this._multipliers[commentType]?.html[tagName] !== undefined) { | ||
score = this._multipliers[commentType].html[tagName]; | ||
} else { | ||
logger.error(`Could not find multiplier for comment [${commentType}], <${tagName}>`); | ||
} | ||
tagWordCount[tagName] = { | ||
symbols: wordCount, | ||
score, | ||
}; | ||
logger.debug("Tag content results", { tagName, symbols, text: element.textContent }); | ||
// If we already had that tag included in the result, merge them and update total count | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I suppose that for the statistics it might be interesting to count words per element, but honestly it's out of scope and doesn't add business value while complicating the code. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Will be changed in #92 |
||
if (Object.keys(tagWordCount).includes(tagName)) { | ||
for (const [k, v] of Object.entries(symbols)) { | ||
if (Object.keys(tagWordCount[tagName].symbols).includes(k)) { | ||
tagWordCount[tagName].symbols[k] = { | ||
...tagWordCount[tagName].symbols[k], | ||
count: tagWordCount[tagName].symbols[k].count + v.count, | ||
}; | ||
} | ||
} | ||
} else { | ||
tagWordCount[tagName] = { | ||
symbols: symbols, | ||
score, | ||
}; | ||
} | ||
} | ||
|
||
return tagWordCount; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't the default be errors?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it is nice to get `info` within the logs by default because some interesting info is there during the process.