Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minor fix #201

Merged
merged 4 commits into from
Nov 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/kiwi/TypoTransformer.h
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,7 @@ namespace kiwi
continualTypoSet,
basicTypoSetWithContinual,
lengtheningTypoSet,
basicTypoSetWithContinualAndLengthening,
};

/**
Expand Down
4 changes: 3 additions & 1 deletion include/kiwi/capi.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ enum
KIWI_MATCH_EMAIL = 2,
KIWI_MATCH_HASHTAG = 4,
KIWI_MATCH_MENTION = 8,
KIWI_MATCH_SERIAL = 16,

KIWI_MATCH_NORMALIZE_CODA = 1 << 16,
KIWI_MATCH_JOIN_NOUN_PREFIX = 1 << 17,
Expand All @@ -139,7 +140,7 @@ enum
KIWI_MATCH_SPLIT_SAISIOT = 1 << 25,
KIWI_MATCH_MERGE_SAISIOT = 1 << 26,

KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION | KIWI_MATCH_Z_CODA,
KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION | KIWI_MATCH_SERIAL | KIWI_MATCH_Z_CODA,
KIWI_MATCH_ALL_WITH_NORMALIZING = KIWI_MATCH_ALL | KIWI_MATCH_NORMALIZE_CODA,
};

Expand Down Expand Up @@ -361,6 +362,7 @@ enum
KIWI_TYPO_CONTINUAL_TYPO_SET = 2,
KIWI_TYPO_BASIC_TYPO_SET_WITH_CONTINUAL = 3,
KIWI_TYPO_LENGTHENING_TYPO_SET = 4,
KIWI_TYPO_BASIC_TYPO_SET_WITH_CONTINUAL_AND_LENGTHENING = 5,
};

/**
Expand Down
4 changes: 2 additions & 2 deletions src/KTrie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1090,11 +1090,11 @@ size_t kiwi::splitByTrie(

if (!!(matchOptions & Match::zCoda) && zCodaFollowable && isHangulCoda(c) && (n + 1 >= str.size() || !isHangulSyllable(str[n + 1])))
{
candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, nonSpaces.size() - 1);
candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, (nonSpaces.size() - 1) * posMultiplier);
}
else if (!!(matchOptions & (Match::splitSaisiot | Match::mergeSaisiot)) && zSiotFollowable && c == 0x11BA && n + 1 < str.size() && isHangulSyllable(str[n + 1]))
{
candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, nonSpaces.size() - 1);
candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, (nonSpaces.size() - 1) * posMultiplier);
}
zCodaFollowable = false;
zSiotFollowable = false;
Expand Down
3 changes: 3 additions & 0 deletions src/Kiwi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -905,6 +905,7 @@ namespace kiwi
morph.vowel = CondVowel::none;
morph.polar = CondPolarity::none;
morph.complex = 0;
morph.saisiot = 0;
morph.lmMorphemeId = getDefaultMorphemeId(s.tokenization[0].tag);
form.candidate[0] = &morph;
}
Expand All @@ -921,6 +922,7 @@ namespace kiwi
morph.vowel = CondVowel::none;
morph.polar = CondPolarity::none;
morph.complex = 0;
morph.saisiot = 0;
morph.chunks = FixedPairVector<const Morpheme*, std::pair<uint8_t, uint8_t>>{ s.tokenization.size() };
for (size_t i = 0; i < s.tokenization.size(); ++i)
{
Expand Down Expand Up @@ -949,6 +951,7 @@ namespace kiwi
cmorph.vowel = CondVowel::none;
cmorph.polar = CondPolarity::none;
cmorph.complex = 0;
cmorph.saisiot = 0;
cmorph.tag = t.tag;
cmorph.lmMorphemeId = getDefaultMorphemeId(t.tag);
foundMorph = &cmorph;
Expand Down
29 changes: 24 additions & 5 deletions src/PathEvaluator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -872,7 +872,6 @@ namespace kiwi
for (auto& curMorph : cands)
{
if (splitComplex && curMorph->getCombined()->complex) continue;
if (splitSaisiot && curMorph->getCombined()->saisiot) continue;
if (blocklist && blocklist->count(curMorph->getCombined())) continue;

// 덧붙은 받침(zCoda)을 위한 지름길
Expand Down Expand Up @@ -1007,7 +1006,8 @@ namespace kiwi
const Vector<U16StringView>& ownFormList,
float typoCostWeight,
const Morpheme* morphFirst,
size_t langVocabSize)
size_t langVocabSize,
bool splitSaisiot)
{
Vector<const WordLL<LmState>*> steps;
for (auto s = result->parent; s->parent; s = s->parent)
Expand All @@ -1029,13 +1029,32 @@ namespace kiwi
float scoreDiff = cur->accScore - prev->accScore;
float typoCostDiff = cur->accTypoCost - prev->accTypoCost;
auto morpheme = cur->morpheme;
const size_t numNewTokens = (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot) ? 1 : morpheme->chunks.size();
const size_t numNewTokens = (splitSaisiot && morpheme->saisiot) || !(morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot)
? morpheme->chunks.size() : 1;
auto& gNode = graph[csearcher(cur)];
scoreDiff += typoCostDiff * typoCostWeight;
scoreDiff /= numNewTokens;
typoCostDiff /= numNewTokens;

if (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot)
if (splitSaisiot && morpheme->saisiot)
{
for (size_t ch = 0; ch < numNewTokens; ++ch)
{
auto& p = morpheme->chunks.getSecond(ch);
ret.emplace_back(
unifyMorpheme(morpheme->chunks[ch]),
KString{},
gNode.startPos + p.first,
gNode.startPos + p.second,
scoreDiff,
typoCostDiff,
typoCostDiff ? gNode.typoFormId : 0,
&gNode - graph
);
}
ret.back().end = gNode.endPos;
}
else if (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot)
{
ret.emplace_back(
unifyMorpheme(morpheme),
Expand Down Expand Up @@ -1274,7 +1293,7 @@ namespace kiwi
{
auto tokens = generateTokenList(
&cand[i], csearcher, graph, ownFormList, kw->typoCostWeight,
kw->morphemes.data(), langVocabSize
kw->morphemes.data(), langVocabSize, splitSaisiot
);
ret.emplace_back(move(tokens), cand[i].accScore, uniqStates[cand[i].rootId], cand[i].spState);
}
Expand Down
4 changes: 4 additions & 0 deletions src/TypoTransformer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,8 @@ namespace kiwi

static const TypoTransformer lengtheningTypoSet = TypoTransformer::fromLengtheningTypoCost(0.25f);

static const TypoTransformer basicTypoSetWithContinualAndLengthening = basicTypoSetWithContinual | lengtheningTypoSet;

switch (set)
{
case kiwi::DefaultTypoSet::withoutTypo:
Expand All @@ -674,6 +676,8 @@ namespace kiwi
return basicTypoSetWithContinual;
case kiwi::DefaultTypoSet::lengtheningTypoSet:
return lengtheningTypoSet;
case kiwi::DefaultTypoSet::basicTypoSetWithContinualAndLengthening:
return basicTypoSetWithContinualAndLengthening;
default:
throw invalid_argument{ "Invalid `DefaultTypoSet`" };
}
Expand Down
Loading