Add Unicode Normalization to speech and braille (#16521)

Closes #16466. Summary of the issue: Several speech synthesizers and braille tables are unable to speak or braille some characters, such as ligatures ("ĳ") or decomposed characters (Latin letters with a modifier to add acute, diaeresis, etc.). Also, italic or bold Unicode characters can't be spoken or brailled by default. Description of user-facing changes: None by default. If Unicode normalization is enabled for speech, speech output for objects and text navigation is normalized. For braille, normalization is applied to all braille output. Speech deliberately applies normalization only to objects and text navigation: for individual character navigation or text selection, we really want to pass the original character to the synthesizer. If we don't, Unicode bold and italic characters are read as their normalized counterparts, which makes it impossible to distinguish them. This problem is less relevant when working with braille. Description of development approach: Added UnicodeNormalizationOffsetConverter to textUtils, with tests. It stores the normalized version of a given string and, based on diffing, calculates offset mappings between the original and normalized strings. Braille output is processed using UnicodeNormalizationOffsetConverter when translating braille with normalization on. The several mappings (braille to raw pos, raw to braille pos) are processed to account for normalization. Added normalization to getTextInfoSpeech and getPropertiesSpeech.
nvaccess · May 21, 2024 · 57ce236 · 57ce236
1 parent bec58a4
commit 57ce236
Show file tree

Hide file tree

Showing 8 changed files with 340 additions and 28 deletions.
diff --git a/source/braille.py b/source/braille.py
@@ -64,6 +64,7 @@
 import brailleViewer
 from autoSettingsUtils.driverSetting import BooleanDriverSetting, NumericDriverSetting
 from utils.security import objectBelowLockScreenAndWindowsIsLocked
+from textUtils import isUnicodeNormalized, UnicodeNormalizationOffsetConverter
 import hwIo
 from editableText import EditableText
 
@@ -496,13 +497,40 @@ def update(self):
 		mode = louis.dotsIO
 		if config.conf["braille"]["expandAtCursor"] and self.cursorPos is not None:
 			mode |= louis.compbrlAtCursor
-		self.brailleCells, self.brailleToRawPos, self.rawToBraillePos, self.brailleCursorPos = louisHelper.translate(
+
+		converter: UnicodeNormalizationOffsetConverter | None = None
+		if config.conf["braille"]["unicodeNormalization"] and not isUnicodeNormalized(self.rawText):
+			converter = UnicodeNormalizationOffsetConverter(self.rawText)
+			textToTranslate = converter.encoded
+			# Typeforms must be adapted to represent normalized characters.
+			textToTranslateTypeforms = [
+				self.rawTextTypeforms[strOffset] for strOffset in converter.computedEncodedToStrOffsets
+			]
+			# Convert the cursor position to a normalized offset.
+			cursorPos = converter.strToEncodedOffsets(self.cursorPos)
+		else:
+			textToTranslate = self.rawText
+			textToTranslateTypeforms = self.rawTextTypeforms
+			cursorPos = self.cursorPos
+
+		self.brailleCells, brailleToRawPos, rawToBraillePos, self.brailleCursorPos = louisHelper.translate(
 			[handler.table.fileName, "braille-patterns.cti"],
-			self.rawText,
-			typeform=self.rawTextTypeforms,
+			textToTranslate,
+			typeform=textToTranslateTypeforms,
 			mode=mode,
-			cursorPos=self.cursorPos
+			cursorPos=cursorPos
 		)
+
+		if converter:
+			# The received brailleToRawPos contains braille to normalized positions.
+			# Process them to represent real raw positions by converting them from normalized ones.
+			brailleToRawPos = [converter.encodedToStrOffsets(i) for i in brailleToRawPos]
+			# The received rawToBraillePos contains normalized to braille positions.
+			# Create a new list based on real raw positions.
+			rawToBraillePos = [rawToBraillePos[i] for i in converter.computedStrToEncodedOffsets]
+		self.brailleToRawPos = brailleToRawPos
+		self.rawToBraillePos = rawToBraillePos
+
 		if (
 			self.selectionStart is not None
 			and self.selectionEnd is not None

diff --git a/source/config/configSpec.py b/source/config/configSpec.py
@@ -35,6 +35,7 @@
 	# symbolLevel: One of the characterProcessing.SymbolLevel values.
 	symbolLevel = integer(default=100)
 	trustVoiceLanguage = boolean(default=true)
+	unicodeNormalization = featureFlag(optionsEnum="BoolFlag", behaviorOfDefault="disabled")
 	includeCLDR = boolean(default=True)
 	beepSpeechModePitch = integer(default=10000,min=50,max=11025)
 	outputDevice = string(default=default)
@@ -82,6 +83,7 @@
 		optionsEnum="ReviewRoutingMovesSystemCaretFlag", behaviorOfDefault="NEVER")
 	readByParagraph = boolean(default=false)
 	wordWrap = boolean(default=true)
+	unicodeNormalization = featureFlag(optionsEnum="BoolFlag", behaviorOfDefault="disabled")
 	focusContextPresentation = option("changedContext", "fill", "scroll", default="changedContext")
 	interruptSpeechWhileScrolling = featureFlag(optionsEnum="BoolFlag", behaviorOfDefault="enabled")
 	showSelection = featureFlag(optionsEnum="BoolFlag", behaviorOfDefault="enabled")

diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py
@@ -1589,6 +1589,17 @@ def makeSettings(self, settingsSizer):
 		self.bindHelpEvent("SpeechSettingsTrust", self.trustVoiceLanguageCheckbox)
 		self.trustVoiceLanguageCheckbox.SetValue(config.conf["speech"]["trustVoiceLanguage"])
 
+		self.unicodeNormalizationCombo: nvdaControls.FeatureFlagCombo = settingsSizerHelper.addLabeledControl(
+			labelText=_(
+				# Translators: This is a label for a combo-box in the Speech settings panel.
+				"Unicode normali&zation"
+			),
+			wxCtrlClass=nvdaControls.FeatureFlagCombo,
+			keyPath=["speech", "unicodeNormalization"],
+			conf=config.conf,
+		)
+		self.bindHelpEvent("SpeechUnicodeNormalization", self.unicodeNormalizationCombo)
+
 		includeCLDRText = _(
 			# Translators: This is the label for a checkbox in the
 			# voice settings panel (if checked, data from the unicode CLDR will be used
@@ -1701,6 +1712,7 @@ def onSave(self):
 			self.symbolLevelList.GetSelection()
 		].value
 		config.conf["speech"]["trustVoiceLanguage"] = self.trustVoiceLanguageCheckbox.IsChecked()
+		self.unicodeNormalizationCombo.saveCurrentValueToConf()
 		currentIncludeCLDR = config.conf["speech"]["includeCLDR"]
 		config.conf["speech"]["includeCLDR"] = newIncludeCldr = self.includeCLDRCheckbox.IsChecked()
 		if currentIncludeCLDR is not newIncludeCldr:
@@ -4145,6 +4157,17 @@ def makeSettings(self, settingsSizer):
 		self.bindHelpEvent("BrailleSettingsWordWrap", self.wordWrapCheckBox)
 		self.wordWrapCheckBox.Value = config.conf["braille"]["wordWrap"]
 
+		self.unicodeNormalizationCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl(
+			labelText=_(
+				# Translators: This is a label for a combo-box in the Braille settings panel.
+				"Unicode normali&zation"
+			),
+			wxCtrlClass=nvdaControls.FeatureFlagCombo,
+			keyPath=["braille", "unicodeNormalization"],
+			conf=config.conf,
+		)
+		self.bindHelpEvent("BrailleUnicodeNormalization", self.unicodeNormalizationCombo)
+
 		self.brailleInterruptSpeechCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl(
 			labelText=_(
 				# Translators: This is a label for a combo-box in the Braille settings panel.
@@ -4184,6 +4207,7 @@ def onSave(self):
 		self.brailleReviewRoutingMovesSystemCaretCombo.saveCurrentValueToConf()
 		config.conf["braille"]["readByParagraph"] = self.readByParagraphCheckBox.Value
 		config.conf["braille"]["wordWrap"] = self.wordWrapCheckBox.Value
+		self.unicodeNormalizationCombo.saveCurrentValueToConf()
 		config.conf["braille"]["focusContextPresentation"] = self.focusContextPresentationValues[self.focusContextPresentationList.GetSelection()]
 		self.brailleInterruptSpeechCombo.saveCurrentValueToConf()
 		self.brailleShowSelectionCombo.saveCurrentValueToConf()

diff --git a/source/speech/speech.py b/source/speech/speech.py
@@ -25,6 +25,7 @@
 import speechDictHandler
 import characterProcessing
 import languageHandler
+from textUtils import unicodeNormalize
 from . import manager
 from .extensions import speechCanceled, pre_speechCanceled, pre_speech
 from .extensions import filter_speechSequence, speechCanceled
@@ -1568,6 +1569,8 @@ def getTextInfoSpeech(  # noqa: C901
 					# There was content after the indentation, so there is no more indentation.
 					indentationDone=True
 			if command:
+				if config.conf["speech"]["unicodeNormalization"]:
+					command = unicodeNormalize(command)
 				if inTextChunk:
 					relativeSpeechSequence[-1]+=command
 				else:
@@ -1775,7 +1778,7 @@ def getPropertiesSpeech(  # noqa: C901
 		reason: OutputReason = OutputReason.QUERY,
 		**propertyValues
 ) -> SpeechSequence:
-	textList: List[str] = []
+	textList: SpeechSequence = []
 	name: Optional[str] = propertyValues.get('name')
 	if name:
 		textList.append(name)
@@ -1968,7 +1971,11 @@ def getPropertiesSpeech(  # noqa: C901
 	errorMessage: str | None = propertyValues.get("errorMessage", None)
 	if errorMessage:
 		textList.append(errorMessage)
-
+	if config.conf["speech"]["unicodeNormalization"]:
+		textList = [
+			unicodeNormalize(t) if isinstance(t, str) else t
+			for t in textList
+		]
 	types.logBadSequenceTypes(textList)
 	return textList