Download this file

hldata.h    129 lines (113 with data), 4.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#ifndef _hldata_h_included_
#define _hldata_h_included_
#include <vector>
#include <string>
#include <set>
#include <map>
/**
 * Store data about user search terms and their expansions.
 *
 * This is used mostly for highlighting result text, walking the matches,
 * and generating spelling suggestions.
 */
struct HighlightData {
    /** The user terms, excluding those with wildcards. This list is
     * intended for orthographic suggestions so the terms are always
     * lowercased, unaccented or not depending on the type of index
     * (as the spelling dictionary is generated from the index terms).
     */
    std::set<std::string> uterms;

    /** The db query terms linked to the uterms entry they were expanded
     * from. This is used for aggregating term stats when generating
     * snippets (for choosing the best terms, allocating slots, etc.)
     */
    std::map<std::string, std::string> terms;

    /** The original user terms-or-groups. This is for display
     * purposes: i.e. when creating a menu to look for a specific
     * matched group inside a preview window. We want to show the
     * user-entered data in the menu, not some transformation, so
     * these are always raw, diacritics and case preserved.
     */
    std::vector<std::vector<std::string> > ugroups;

    /** Processed/expanded terms and groups. Used for looking for
     * regions to highlight. A group can be a PHRASE or NEAR entry (we
     * process everything as NEAR to keep things reasonably simple).
     * Terms are just groups with 1 entry. All terms are transformed to
     * be compatible with index content (unaccented and lowercased as
     * needed depending on configuration), and the list may include
     * values expanded from the original terms by stem or wildcard
     * expansion.
     */
    std::vector<std::vector<std::string> > groups;

    /** Group slacks. Parallel to groups */
    std::vector<int> slacks;

    /** Index into ugroups for each group. Parallel to groups. As a
     * user term or group may generate many processed/expanded terms
     * or groups, this is how we relate an expansion to its source
     * (used, e.g. for generating anchors for walking search matches
     * in the preview window).
     */
    std::vector<size_t> grpsugidx;

    /** Empty every container held by this object.
     *
     * The terms map is reset along with the other members: leaving it
     * populated would let mappings from a previous query leak into the
     * next use of a recycled object.
     */
    void clear()
    {
        uterms.clear();
        terms.clear();
        ugroups.clear();
        groups.clear();
        slacks.clear();
        grpsugidx.clear();
    }

    // Only declared here; the definition is not part of this header.
    void append(const HighlightData&);

    // Print (debug). Only declared here; the definition is not part of
    // this header.
    void toString(std::string& out);
};
// Stretch the [sta, sto] window so that it contains pos.
// The two bounds are examined independently of each other, so a single
// call may move both of them (this happens when sta > sto on entry).
inline void setWinMinMax(int pos, int& sta, int& sto)
{
    sta = (pos < sta) ? pos : sta;
    sto = (pos > sto) ? pos : sto;
}
// Check that at least an entry from the first position list is inside
// the window and recurse on next list. The window is readjusted as
// the successive terms are found. Mostly copied from Xapian code.
//
// Only the declaration is in this header; the definition is elsewhere.
//
// @param window the search window width
// @param plists the position list vector (a vector of pointers to the
//   individual position lists)
// @param i the position list to process (we then recurse with the next list)
// @param min the current minimum pos for a found term
// @param max the current maximum pos for a found term
// @param sp, ep output: the found area, written through these pointers
// @param minpos bottom of search: this is the highest point of
//   any previous match. We don't look below this as overlapping matches
//   make no sense for highlighting.
// @return not documented here; presumably true when a match was found
//   inside the window -- TODO confirm against the definition.
extern bool do_proximity_test(
int window, std::vector<const std::vector<int>*>& plists,
unsigned int i, int min, int max, int *sp, int *ep, int minpos);
/**** The following is used by plaintorich.cpp for finding zones to
highlight and by rclabsfromtext.cpp to choose fragments for the
abstract */
// A single match for a search group, located in the document text.
struct GroupMatchEntry {
    // Byte offsets in the document text: first is the start of the
    // match, second is its end.
    std::pair<int, int> offs;
    // Index of the search group which produced this match. It lets us
    // relate the match to the original user input.
    size_t grpidx;

    // Build an entry from the start/end offsets and the group index.
    GroupMatchEntry(int sta, int sto, size_t idx)
        : offs{sta, sto}, grpidx{idx}
    {}
};
// Find NEAR matches for one group of terms.
//
// Only the declaration is in this header; the definition is elsewhere.
//
// @param hldata Data about the user query
// @param grpidx Index in hldata.groups for the group we process
// @param inplists Position lists for the group terms, keyed by term string
// @param gpostobytes Translation of term position to start/end byte offsets
// @param[out] tboffs Found matches
// @return not documented here; presumably reports success -- TODO confirm
//   against the definition.
extern bool matchGroup(
const HighlightData& hldata,
unsigned int grpidx,
const std::map<std::string, std::vector<int>>& inplists,
const std::map<int, std::pair<int,int>>& gpostobytes,
std::vector<GroupMatchEntry>& tboffs
);
#endif /* _hldata_h_included_ */