From 3f853c6db8f3406a09ff161311334b1b40ff1f7b Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Wed, 28 Aug 2024 23:02:26 +0000 Subject: [PATCH] Unify code --- searchsummary/src/vespa/juniper/sumdesc.cpp | 260 ++++++++------------ 1 file changed, 99 insertions(+), 161 deletions(-) diff --git a/searchsummary/src/vespa/juniper/sumdesc.cpp b/searchsummary/src/vespa/juniper/sumdesc.cpp index 66125bdf44fe..2e62af9be478 100644 --- a/searchsummary/src/vespa/juniper/sumdesc.cpp +++ b/searchsummary/src/vespa/juniper/sumdesc.cpp @@ -23,19 +23,16 @@ using namespace juniper::separators; namespace { -static constexpr char replacement_char = '.'; +constexpr char replacement_char = '.'; -char printable_char(char c) -{ +char +printable_char(char c) { unsigned char uc = (unsigned char) c; - if (uc >= 0x80 || uc < (unsigned char) ' ') { - return replacement_char; - } - return c; + return (uc >= 0x80 || uc < (unsigned char) ' ') ? replacement_char : c; } -bool wordchar(const unsigned char* s) -{ +bool +wordchar(const unsigned char* s) { unsigned char c = *s; if (c & 0x80) { ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s); @@ -45,30 +42,29 @@ bool wordchar(const unsigned char* s) } } -bool wordchar_or_il_ann_char(const unsigned char* s, char32_t annotation_char) -{ +bool +wordchar_or_il_ann_char(const unsigned char* s, char32_t annotation_char) { unsigned char c = *s; if (c & 0x80) { ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s); - return Fast_UnicodeUtil::IsWordChar(u) || - static_cast(u) == annotation_char; + return Fast_UnicodeUtil::IsWordChar(u) || static_cast(u) == annotation_char; } else { return std::isalnum(c); } } -bool wordchar_or_il_ann_anchor(const unsigned char* s) -{ +bool +wordchar_or_il_ann_anchor(const unsigned char* s) { return wordchar_or_il_ann_char(s, interlinear_annotation_anchor); } -bool wordchar_or_il_ann_terminator(const unsigned char* s) -{ +bool +wordchar_or_il_ann_terminator(const unsigned char* s) { return wordchar_or_il_ann_char(s, 
interlinear_annotation_terminator); } -bool nonwordchar(const unsigned char* s) -{ +bool +nonwordchar(const unsigned char* s) { unsigned char c = *s; if (c & 0x80) { ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s); @@ -79,8 +75,7 @@ bool nonwordchar(const unsigned char* s) } bool -il_ann_char(const unsigned char* s, char32_t annotation_char) -{ +il_ann_char(const unsigned char* s, char32_t annotation_char) { unsigned char c = *s; if (c & 0x80) { ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s); @@ -91,20 +86,17 @@ il_ann_char(const unsigned char* s, char32_t annotation_char) } bool -il_ann_anchor_char(const unsigned char* s) -{ +il_ann_anchor_char(const unsigned char* s) { return il_ann_char(s, interlinear_annotation_anchor); } bool -il_ann_separator_char(const unsigned char* s) -{ +il_ann_separator_char(const unsigned char* s) { return il_ann_char(s, interlinear_annotation_separator); } bool -il_ann_terminator_char(const unsigned char* s) -{ +il_ann_terminator_char(const unsigned char* s) { return il_ann_char(s, interlinear_annotation_terminator); } @@ -113,8 +105,8 @@ il_ann_terminator_char(const unsigned char* s) * beginning of the next/last word) * @return The number of bytes moved */ -int complete_word(unsigned char* start, ssize_t length, - const unsigned char*& ptr, off_t increment) +int +complete_word(unsigned char* start, ssize_t length, const unsigned char*& ptr, off_t increment) { bool (*chartest)(const unsigned char*); int moved = 0; @@ -143,11 +135,9 @@ int complete_word(unsigned char* start, ssize_t length, whitespace_elim = true; // Change direction of scan increment = -increment; - if (increment > 0) { - chartest = wordchar_or_il_ann_anchor; - } else { - chartest = wordchar_or_il_ann_terminator; - } + chartest = (increment > 0) + ? wordchar_or_il_ann_anchor + : wordchar_or_il_ann_terminator; } } else { // Found a wordchar at pointer @@ -155,18 +145,15 @@ int complete_word(unsigned char* start, ssize_t length, // for "non-wordness". 
Otherwise we might add an extra word if (increment > 0) { const unsigned char* pre_ptr = ptr; - int cur_move = Fast_UnicodeUtil::UTF8move(start, length, - pre_ptr, -1); + int cur_move = Fast_UnicodeUtil::UTF8move(start, length, pre_ptr, -1); if (!wordchar(pre_ptr) && !il_ann_terminator_char(pre_ptr)) // Points at start of new word { whitespace_elim = true; // Change direction of scan increment = -increment; - if (increment > 0) { - chartest = wordchar_or_il_ann_anchor; - } else { - chartest = wordchar_or_il_ann_terminator; - } + chartest = (increment > 0) + ? wordchar_or_il_ann_anchor + : wordchar_or_il_ann_terminator; ptr = pre_ptr; moved += cur_move; } else { @@ -182,8 +169,7 @@ int complete_word(unsigned char* start, ssize_t length, for (;;) { LOG(spam, "[%s%d%s%c]", (whitespace_elim ? "^" : ""), moved, (increment > 0 ? "+" : "-"), printable_char(*ptr)); - int cur_move = Fast_UnicodeUtil::UTF8move(start, length, - ptr, increment); + int cur_move = Fast_UnicodeUtil::UTF8move(start, length, ptr, increment); // give up if past end of read (may still be a successful move // ending at the first character outside of the start+length @@ -229,8 +215,7 @@ int complete_word(unsigned char* start, ssize_t length, moved += cur_move; continue; } - LOG(spam, "complete_word: Breaking at char %c/0x%x (%d)", printable_char(*ptr), - *ptr, cur_move); + LOG(spam, "complete_word: Breaking at char %c/0x%x (%d)", printable_char(*ptr), *ptr, cur_move); // count this character (it is the first blank/wordchar) // only if we are going forward and it is a word character // since we are then supposed to be pointing to the first @@ -244,8 +229,7 @@ int complete_word(unsigned char* start, ssize_t length, if (moved >= MAX_SCAN_WORD && (chartest != il_ann_anchor_char) && (chartest != il_ann_terminator_char)) { - LOG(spam, "Word length extended max word length %d, " - "breaking at char 0x%x", MAX_SCAN_WORD, *ptr); + LOG(spam, "Word length extended max word length %d, breaking at char 0x%x", 
MAX_SCAN_WORD, *ptr); break; } } @@ -265,8 +249,7 @@ int complete_word(unsigned char* start, ssize_t length, } -SummaryDesc::highlight_desc::highlight_desc(off_t pos, - ssize_t len, bool highlight) +SummaryDesc::highlight_desc::highlight_desc(off_t pos, ssize_t len, bool highlight) : _pos(pos), _len(len), _highlight(highlight) { LOG(spam, "-- new desc: pos %" PRId64 " len %ld %s", @@ -292,7 +275,7 @@ SummaryDesc::SummaryDesc(Matcher* matcher, ssize_t length, ssize_t min_length, _max_matches(max_matches), _match_elems(), _document_length(matcher->DocumentSize()), - _fulldoc() + _fulldoc() { /* Check if the whole document fits within requested length and * process this @@ -329,7 +312,8 @@ SummaryDesc::SummaryDesc(Matcher* matcher, ssize_t length, ssize_t min_length, SummaryDesc::~SummaryDesc() = default; -void SummaryDesc::locate_accidential_matches() +void +SummaryDesc::locate_accidential_matches() { key_occ_vector::const_iterator kit = _occ.begin(); @@ -403,8 +387,7 @@ void SummaryDesc::locate_accidential_matches() _plist.insert(pit, highlight_desc(d->_pos, start_len, false)); // new keyword - print_list::iterator kwit = - _plist.insert(pit, highlight_desc(kpos, klen, true)); + print_list::iterator kwit = _plist.insert(pit, highlight_desc(kpos, klen, true)); if (end_len) { LOG(spam, "-- Was: (%" PRId64 ", %" PRId64 ")", static_cast(d->_pos), static_cast(d->_len)); @@ -461,7 +444,8 @@ void SummaryDesc::locate_accidential_matches() /* find a proper amount of matches */ -int SummaryDesc::find_matches() +int +SummaryDesc::find_matches() { int match_len = 0; int match_count = 0; @@ -470,11 +454,7 @@ int SummaryDesc::find_matches() _est_len = 0; // Find enough proper matches (without overlap) - for (match_candidate_set::iterator it = _match_results.begin(); - it != _match_results.end(); - ++it) - { - MatchCandidate* m = (*it); + for (MatchCandidate* m : _match_results) { if (overlap(m)) continue; @@ -482,9 +462,7 @@ int SummaryDesc::find_matches() assert(size >= 0); 
m->make_keylist(); - keylist& klist = m->_klist; - assert(klist.size() > 0); - (void) klist; + assert(m->_klist.size() > 0); _clist.insert(m); @@ -503,14 +481,11 @@ int SummaryDesc::find_matches() match_count++; match_elems += m->elems(); - _est_len = match_len - adjust_len - + (2*(_surround_len)+MIN_CONTINUATION)*match_count; - if (_est_len >= (int)_min_length - && match_count >= _max_matches) + _est_len = match_len - adjust_len + (2*(_surround_len)+MIN_CONTINUATION)*match_count; + if (_est_len >= (int)_min_length && match_count >= _max_matches) break; } - LOG(spam, "QHL: %d matches, raw len %d, estimated len %d, elements %d", - match_count, match_len, _est_len, match_elems); + LOG(spam, "QHL: %d matches, raw len %d, estimated len %d, elements %d", match_count, match_len, _est_len, match_elems); // Quick estimate of the query word length _hit_len = 5*match_elems; @@ -520,7 +495,8 @@ int SummaryDesc::find_matches() /** Check if a character is a configured connector character */ -bool SummaryDesc::word_connector(const unsigned char* s) +bool +SummaryDesc::word_connector(const unsigned char* s) { unsigned char c = *s; if (c & 0x80) { @@ -539,11 +515,11 @@ bool SummaryDesc::word_connector(const unsigned char* s) * legal connector characters. * @return The number of bytes moved */ -int SummaryDesc::complete_extended_token(unsigned char* start, ssize_t length, - const unsigned char*& ptr, off_t increment) +int +SummaryDesc::complete_extended_token(unsigned char* start, ssize_t length, const unsigned char*& ptr, off_t increment) { int moved = 0; - const unsigned char *old_ptr = NULL; + const unsigned char *old_ptr = nullptr; for (;;) { // Start by moving to the start/end of the word.. 
moved += complete_word(start, length, ptr, increment); @@ -563,8 +539,7 @@ int SummaryDesc::complete_extended_token(unsigned char* start, ssize_t length, // Position to previous/next character to check if this is a // "real" break: if (increment < 0) { - prelen = Fast_UnicodeUtil::UTF8move(start, length, - preptr, increment); + prelen = Fast_UnicodeUtil::UTF8move(start, length, preptr, increment); if (!prelen) return moved; } else { @@ -577,12 +552,10 @@ int SummaryDesc::complete_extended_token(unsigned char* start, ssize_t length, return moved; } char wconn = *preptr; - (void) wconn; LOG(spam, "Found word connector case candidate (%c)", printable_char(wconn)); // Read the character before/after the connector character: - int addlen = Fast_UnicodeUtil::UTF8move(start, length, - preptr, increment); + int addlen = Fast_UnicodeUtil::UTF8move(start, length, preptr, increment); if (!addlen) return moved; // Not possible to extend anything here @@ -595,9 +568,10 @@ int SummaryDesc::complete_extended_token(unsigned char* start, ssize_t length, return moved; } - // If a block of chinese data does not contain any spaces we have to return - // here in order to avoid searching all the way to the start/end. - return moved; + // If a block of chinese data does not contain any spaces we have to return + // here in order to avoid searching all the way to the start/end. + // TODO Hard to tell how the code below can be executed.... + return moved; // Ok, found a separator case, include another word.. 
@@ -606,8 +580,7 @@ int SummaryDesc::complete_extended_token(unsigned char* start, ssize_t length, // previous char to see if we are at the start of a word, so // we have to move forward once here: if (increment > 0) { - addlen = Fast_UnicodeUtil::UTF8move(start, length, - preptr, increment); + addlen = Fast_UnicodeUtil::UTF8move(start, length, preptr, increment); if (!addlen) return moved; moved += addlen; @@ -631,8 +604,7 @@ SummaryDesc::get_summary(const char* buffer, size_t bytes, const SummaryConfig* ssize_t prev_end = 0; bool start_cont = false; // Set if this segment has been continued at the start - LOG(debug, "start get_summary, substrings: %ld, est. length: %d", - _plist.size(), _est_len); + LOG(debug, "start get_summary, substrings: %ld, est. length: %d", _plist.size(), _est_len); // Set the current summary config. Implies that get_summary is // not MT safe wrt. this SummaryDesc (not a very heavy // restriction..) @@ -669,8 +641,7 @@ SummaryDesc::get_summary(const char* buffer, size_t bytes, const SummaryConfig* // In spite of precautions keyword hits came so tight that // we got ourselves an overlap after all. Just skip // whatever needed.. - LOG(spam, "Overlap elim during string buildup: " - "previous end %" PRId64 ", current pos %" PRId64, + LOG(spam, "Overlap elim during string buildup: previous end %" PRId64 ", current pos %" PRId64, static_cast(prev_end), static_cast(pos)); if (pos + len <= prev_end) { continue; @@ -694,18 +665,15 @@ SummaryDesc::get_summary(const char* buffer, size_t bytes, const SummaryConfig* * word/starting space tokens (only if previous segment is not * adjacent!) 
*/ - const unsigned char* ptr = - reinterpret_cast(&buffer[pos]); + const unsigned char* ptr = reinterpret_cast(&buffer[pos]); if (!d._highlight && start_cont && prev_end < pos) { // Complete beginning word by extending the prefix - unsigned char* b = - reinterpret_cast(const_cast(buffer)); + unsigned char* b = reinterpret_cast(const_cast(buffer)); int moved = complete_extended_token(b, bytes, ptr, -1); pos -= moved; len += moved; } else if (!d._highlight) { - LOG(spam, "Not completing word at " - "char %c/0x%x, prev_end %" PRId64 ", pos %" PRId64, + LOG(spam, "Not completing word at char %c/0x%x, prev_end %" PRId64 ", pos %" PRId64, printable_char(*ptr), *ptr, static_cast(prev_end), static_cast(pos)); } @@ -721,24 +689,18 @@ SummaryDesc::get_summary(const char* buffer, size_t bytes, const SummaryConfig* // ... in the start or the end or not at all, but overlap // is taken care of in the next loop.. Complete end of // word by appending at the end - unsigned char* b = - reinterpret_cast(const_cast(buffer)); + unsigned char* b = reinterpret_cast(const_cast(buffer)); int moved = complete_extended_token(b, max_len, ptr, +1); len += moved; if ((pos + len) >= next_pos) { - LOG(spam, "Word completion: no space char found - " - "joining at pos %" PRId64, static_cast(next_pos)); + LOG(spam, "Word completion: no space char found - joining at pos %" PRId64, static_cast(next_pos)); } } else if (!d._highlight) { - LOG(spam, "Not completing word at " - "char %c/0x%x, next_pos %" PRId64, - printable_char(*ptr), *ptr, static_cast(next_pos)); + LOG(spam, "Not completing word at char %c/0x%x, next_pos %" PRId64, printable_char(*ptr), *ptr, static_cast(next_pos)); } JD_INVAR(JD_DESC, len >= 0, len = 0, - LOG(error, - "get_summary: Invariant failed, len = %ld", - static_cast(len))); + LOG(error, "get_summary: Invariant failed, len = %ld", static_cast(len))); int add_len = ((int)bytes > len ? 
len : bytes); LOG(spam, "bytes %zd pos %" PRId64 " len %" PRId64 " %s", @@ -755,34 +717,31 @@ SummaryDesc::get_summary(const char* buffer, size_t bytes, const SummaryConfig* } if (s.size() > 0 && prev_end < (int)_document_length) s.insert(s.end(), sumconf->dots().begin(), sumconf->dots().end()); - LOG(debug, "get_summary: Length of summary %ld bytes %ld chars", - s.size(), a.charLen()); - _sumconf = NULL; // Not valid after this call. + LOG(debug, "get_summary: Length of summary %ld bytes %ld chars", s.size(), a.charLen()); + _sumconf = nullptr; // Not valid after this call. char_size = a.charLen(); return std::string(s.begin(), s.end()); } -bool SummaryDesc::overlap(MatchCandidate* m) +bool +SummaryDesc::overlap(MatchCandidate* m) { // Walk through previous matches - exit if overlap - for (cand_list::iterator it = _clist.begin(); - it != _clist.end(); - ++it) - { + for (MatchCandidate *cand : _clist) { MatchCandidate *m1, *m2; - if ((*it)->starttoken() < m->starttoken()) { - m1 = *it; + if (cand->starttoken() < m->starttoken()) { + m1 = cand; m2 = m; } else { - m2 = *it; + m2 = cand; m1 = m; } if (m1->endpos() > m2->starttoken()) { LOG(spam, "overlap: [%" PRId64 ", %" PRId64 "] <-> [%" PRId64 ", %" PRId64 "]", static_cast(m->starttoken()), static_cast(m->endpos()), - static_cast((*it)->starttoken()), static_cast((*it)->endpos())); + static_cast(cand->starttoken()), static_cast(cand->endpos())); return true; } } @@ -790,7 +749,8 @@ bool SummaryDesc::overlap(MatchCandidate* m) } -int SummaryDesc::recompute_estimate(int len_per_elem) +int +SummaryDesc::recompute_estimate(int len_per_elem) { int new_est = 0; int affected_segments = 0; @@ -824,24 +784,18 @@ int SummaryDesc::recompute_estimate(int len_per_elem) // Only fit one elem at start if (len_per_elem < seglen) { affected_segments++; - LOG(spam, "recompute_estimate prefix " - "(dist %d): len %d (affected)", - seglen, len_per_elem); + LOG(spam, "recompute_estimate prefix (dist %d): len %d (affected)", seglen, 
len_per_elem); seglen = len_per_elem; } else { - LOG(spam, "recompute_estimate: prefix len %d", - seglen); + LOG(spam, "recompute_estimate: prefix len %d", seglen); } prefix = false; } else if ((len_per_elem << 1) < seglen) { affected_segments +=2; - LOG(spam, "recompute_estimate(dist %d): " - "len %d (affected*2)", - seglen, len_per_elem*2 + MIN_CONTINUATION); + LOG(spam, "recompute_estimate(dist %d): len %d (affected*2)", seglen, len_per_elem*2 + MIN_CONTINUATION); seglen = len_per_elem * 2 + MIN_CONTINUATION; } else { - LOG(spam, "recompute_estimate: mid len %d", - seglen); + LOG(spam, "recompute_estimate: mid len %d", seglen); } new_est += seglen; prev_pos = (*kit)->startpos() + (*kit)->tokenlen; @@ -855,13 +809,11 @@ int SummaryDesc::recompute_estimate(int len_per_elem) LOG(spam, "recompute_estimate: end len %d", xlen); } else { affected_segments++; - LOG(spam, "recompute_estimate: end len %d (affected)", - len_per_elem); + LOG(spam, "recompute_estimate: end len %d (affected)", len_per_elem); new_est += len_per_elem; } - LOG(spam, "recompute_estimate(%d): %d -> %d, affected %d", - len_per_elem, _est_len, new_est, affected_segments); + LOG(spam, "recompute_estimate(%d): %d -> %d, affected %d", len_per_elem, _est_len, new_est, affected_segments); _est_len = new_est; /* Re-set available print length per element (prefix or postfix) */ @@ -873,27 +825,24 @@ int SummaryDesc::recompute_estimate(int len_per_elem) LOG(spam, "recompute_estimate --> %d", len_per_elem); if (affected_segments > 0 && _length > _est_len + MIN_SURROUND_LEN) { - int adj = (_length - _hit_len - - (_est_len + MIN_SURROUND_LEN)) / affected_segments; + int adj = (_length - _hit_len - (_est_len + MIN_SURROUND_LEN)) / affected_segments; // Again re-adjust element length to sensible values if (len_per_elem + adj < MIN_SURROUND_LEN) { - LOG(spam, "recompute_estimate(%d) " - "(below MIN_SURROUND_LEN threshold)", - len_per_elem); + LOG(spam, "recompute_estimate(%d) (below MIN_SURROUND_LEN threshold)", 
len_per_elem); adj = (MIN_SURROUND_LEN - len_per_elem); len_per_elem = MIN_SURROUND_LEN; } else { len_per_elem += adj; } _est_len += adj * affected_segments; - LOG(spam, "recompute_estimate (adj %d) el.len %d new est_len %d", - adj, len_per_elem, _est_len); + LOG(spam, "recompute_estimate (adj %d) el.len %d new est_len %d", adj, len_per_elem, _est_len); } return len_per_elem; } -void SummaryDesc::build_highlight_descs() +void +SummaryDesc::build_highlight_descs() { /* Set available print length per element (prefix or postfix) */ int len_per_elem; @@ -929,18 +878,9 @@ void SummaryDesc::build_highlight_descs() off_t pos = 0; off_t startpos = 0; - for (cand_list::iterator cit = _clist.begin(); - cit != _clist.end(); - ++cit) - { + for (const auto & cand : _clist) { /* look at each keyword within match */ - keylist& klist = (*cit)->_klist; - - for (keylist::iterator kit = klist.begin(); - kit != klist.end(); - ++kit) - { - key_occ* k = *kit; + for (key_occ * k : cand->_klist) { int max_len = k->startpos() - pos; // the same occurrence may appear twice in a match, in // which case length will be < 0 @@ -976,8 +916,7 @@ void SummaryDesc::build_highlight_descs() if (pos > 0) { // Adding final segment, ensure that there is enough text available.. 
- int max_len = std::min(len_per_elem, - static_cast(_matcher->DocumentSize() - pos)); + int max_len = std::min(len_per_elem, static_cast(_matcher->DocumentSize() - pos)); add_desc(pos, max_len, false); } LOG(debug, "Summary: start %" PRId64 " end: %" PRId64, static_cast(startpos), static_cast(pos)); @@ -986,19 +925,18 @@ void SummaryDesc::build_highlight_descs() /* create description for the complete document */ -void SummaryDesc::build_fulldoc_desc() +void +SummaryDesc::build_fulldoc_desc() { LOG(debug, "Generating query highlights for complete document"); off_t pos = 0; - for (key_occ_vector::const_iterator kit = _occ.begin(); - kit != _occ.end(); ++kit) - { - int klen = (*kit)->tokenlen; - int kpos = (*kit)->startpos(); + for (const auto & token : _occ) { + int klen = token->tokenlen; + int kpos = token->startpos(); add_desc(pos, kpos - pos, false); // Use valid() info to filter out non-phrase terms if this is // a phrase search: - add_desc(kpos, klen, (!_matcher->UsesValid()) || (*kit)->valid()); + add_desc(kpos, klen, (!_matcher->UsesValid()) || token->valid()); pos = kpos + klen; } add_desc(pos, _matcher->DocumentSize() - pos, false); @@ -1006,12 +944,12 @@ void SummaryDesc::build_fulldoc_desc() } -void SummaryDesc::add_desc(off_t pos, ssize_t len, bool highlight) +void +SummaryDesc::add_desc(off_t pos, ssize_t len, bool highlight) { if (len == 0) return; JD_INVAR(JD_DUMP, len > 0, return, - LOG(info, "add_desc len %ld, %s", static_cast(len), - (highlight ? "highlight" : "")); assert(false)); - _plist.push_back(highlight_desc(pos, len, highlight)); + LOG(info, "add_desc len %ld, %s", static_cast(len), (highlight ? "highlight" : "")); assert(false)); + _plist.emplace_back(pos, len, highlight); }