From 35443a055b725c41c970138f1ed2270714d79cb9 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Wed, 11 Oct 2023 13:43:09 +0800 Subject: [PATCH 1/3] feat(graphql): apply confidence gate to OCR paragraphs --- src/graphql/__tests__/util.js | 4 +-- src/graphql/util.js | 57 ++++++++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/src/graphql/__tests__/util.js b/src/graphql/__tests__/util.js index 33dbcd83..9be79e3b 100644 --- a/src/graphql/__tests__/util.js +++ b/src/graphql/__tests__/util.js @@ -137,7 +137,6 @@ if (process.env.GCS_BUCKET_NAME) { "status": "SUCCESS", "text": "排 汗 - 汗和排尿的差別 想要健康長壽就要想辦法一天一次大量排汗 德國醫學博士艾倫斯特發現:所有運動選手中 唯獨馬拉松選手沒有罹患癌症病例。 @@ -168,7 +167,8 @@ if (process.env.GCS_BUCKET_NAME) { 與自律神經。 藉著汗,氣化熱消耗熱量,能夠提升代謝力, 不但減少體脂肪,還有助於消除肥胖。 - 可以先從關掉冷氣做起", + 可以先從關掉冷氣做起 + ", "type": "TRANSCRIPT", "userId": "user-id", } diff --git a/src/graphql/util.js b/src/graphql/util.js index 7c1b569b..c61f807c 100644 --- a/src/graphql/util.js +++ b/src/graphql/util.js @@ -690,6 +690,46 @@ export function createAIResponse({ user, ...loadingResponseBody }) { } const imageAnnotator = new ImageAnnotatorClient(); +const OCR_CONFIDENCE_THRESHOLD = 0.85; + +/** + * @param {ITextAnnotation} fullTextAnnotation - The fullTextAnnotation returned by client.documentTextDetection + * @returns {string} The extracted text that is comprised of paragraphs passing OCR_CONFIDENCE_THRESHOLD + */ +function extractTextFromFullTextAnnotation(fullTextAnnotation) { + const { + pages: [{ blocks }], + } = fullTextAnnotation; + + // Hierarchy described in https://cloud.google.com/vision/docs/fulltext-annotations#annotating_an_image_using_document_text_ocr + // + return blocks + .flatMap(({ paragraphs }) => + paragraphs + .filter(({ confidence }) => confidence >= OCR_CONFIDENCE_THRESHOLD) + .flatMap(({ words }) => + words.flatMap(({ symbols }) => + symbols.map(({ text, property }) => { + if (!property || !property.detectedBreak) return text; + + // Word break type described in + // http://googleapis.github.io/googleapis/java/grpc-google-cloud-vision-v1/0.1.5/apidocs/com/google/cloud/vision/v1/TextAnnotation.DetectedBreak.BreakType.html#UNKNOWN + const breakStr = [ + 'EOL_SURE_SPACE', + 'HYPHEN', + 'LINE_BREAK', + ].includes(property.detectedBreak.type) + ? '\n' + : ' '; + return property.detectedBreak.isPrefix + ? `${breakStr}${text}` + : `${text}${breakStr}`; + }) + ) + ) + ) + .join(''); +} /** * @param {object} queryInfo - contains type and media entry ID of contents after fileUrl @@ -713,10 +753,25 @@ export async function createTranscript(queryInfo, fileUrl, user) { ] = await imageAnnotator.documentTextDetection(fileUrl); console.log('[createTranscript]', queryInfo.id, fullTextAnnotation); + + // This should not happen, but just in case + // + if ( + !fullTextAnnotation || + !fullTextAnnotation.pages || + fullTextAnnotation.pages.length === 0 + ) { + return update({ + status: 'SUCCESS', + // No text detected + text: '', + }); + } + return update({ status: 'SUCCESS', // Write '' if no text detected - text: fullTextAnnotation?.text ?? '', + text: extractTextFromFullTextAnnotation(fullTextAnnotation), }); } From 2f6bbbed06b29b0d685e352af1df0bcf2ba4f2a7 Mon Sep 17 00:00:00 2001 From: Johnson Liang Date: Thu, 12 Oct 2023 13:14:56 +0800 Subject: [PATCH 2/3] feat(graphql): adjust OCR_CONFIDENCE_THRESHOLD to 0.75 Per discussion in https://g0v.hackmd.io/t9ypB87SQBuMjjW_PheZVg#Revisiting-OCR-accuracy --- src/graphql/util.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graphql/util.js b/src/graphql/util.js index c61f807c..ac2ba3a2 100644 --- a/src/graphql/util.js +++ b/src/graphql/util.js @@ -690,7 +690,7 @@ export function createAIResponse({ user, ...loadingResponseBody }) { } const imageAnnotator = new ImageAnnotatorClient(); -const OCR_CONFIDENCE_THRESHOLD = 0.85; +const OCR_CONFIDENCE_THRESHOLD = 0.75; /** * @param {ITextAnnotation} fullTextAnnotation - The fullTextAnnotation returned by client.documentTextDetection From feef6096d371a187b291d6306a3fcd6942ea5adf Mon Sep 17 00:00:00 2001 From: MrOrz Date: Fri, 13 Oct 2023 01:21:24 +0800 Subject: [PATCH 3/3] fix: update snapshot for lower OCR confidence threshold --- src/graphql/__tests__/util.js | 1 + 1 file changed, 1 insertion(+) diff --git a/src/graphql/__tests__/util.js b/src/graphql/__tests__/util.js index 9be79e3b..2d912765 100644 --- a/src/graphql/__tests__/util.js +++ b/src/graphql/__tests__/util.js @@ -137,6 +137,7 @@ if (process.env.GCS_BUCKET_NAME) { "status": "SUCCESS", "text": "排 汗 + 汗和排尿的差別 想要健康長壽就要想辦法一天一次大量排汗 德國醫學博士艾倫斯特發現:所有運動選手中 唯獨馬拉松選手沒有罹患癌症病例。