Switch to unified view

a/src/rcldb/stemdb.cpp b/src/rcldb/stemdb.cpp
...
...
33
#include "utf8iter.h"
33
#include "utf8iter.h"
34
#include "textsplit.h"
34
#include "textsplit.h"
35
#include "rcldb.h"
35
#include "rcldb.h"
36
#include "rcldb_p.h"
36
#include "rcldb_p.h"
37
#include "synfamily.h"
37
#include "synfamily.h"
38
#include "unacpp.h"
38
39
39
#include <iostream>
40
#include <iostream>
40
41
41
using namespace std;
42
using namespace std;
42
43
...
...
54
}
55
}
55
56
56
/**
57
/**
57
 * Create database of stem to parents associations for a given language.
58
 * Create database of stem to parents associations for a given language.
58
 */
59
 */
59
bool WritableStemDb::createDb(const string& lang)
60
bool createExpansionDbs(Xapian::WritableDatabase& wdb, 
61
          const vector<string>& langs)
60
{
62
{
61
    LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
63
    LOGDEB(("StemDb::createExpansionDbs\n"));
62
    Chrono cron;
64
    Chrono cron;
63
    createMember(lang);
65
64
    string prefix = entryprefix(lang);
66
    vector<XapWritableSynFamily> stemdbs;
67
    for (unsigned int i = 0; i < langs.size(); i++) {
68
  stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
69
  stemdbs[i].deleteMember(langs[i]);
70
  stemdbs[i].createMember(langs[i]);
71
  stemdbs[i].setCurrentMemberName(langs[i]);
72
    }
65
73
66
    // We walk the list of all terms, and stem each. We skip terms which
74
    // We walk the list of all terms, and stem each. We skip terms which
67
    // don't look like natural language.
75
    // don't look like natural language.
68
    // If the stem is not identical to the term, we add a synonym entry.
76
    // If the stem is not identical to the term, we add a synonym entry.
69
    // Statistics
77
    // Statistics
...
...
71
    int stemconst = 0; // Stem == term
79
    int stemconst = 0; // Stem == term
72
    int allsyns = 0; // Total number of entries created
80
    int allsyns = 0; // Total number of entries created
73
81
74
    string ermsg;
82
    string ermsg;
75
    try {
83
    try {
76
        Xapian::Stem stemmer(lang);
84
  vector<Xapian::Stem> stemmers;
85
  for (unsigned int i = 0; i < langs.size(); i++) {
86
      stemmers.push_back(Xapian::Stem(langs[i]));
87
  }
77
88
78
        for (Xapian::TermIterator it = m_wdb.allterms_begin(); 
89
        for (Xapian::TermIterator it = wdb.allterms_begin(); 
79
         it != m_wdb.allterms_end(); it++) {
90
         it != wdb.allterms_end(); it++) {
80
        // If the term has any non-lowercase 7bit char (that is,
91
        // If the term has any non-lowercase 7bit char (that is,
81
            // numbers, capitals and punctuation) don't stem.
92
            // numbers, capitals and punctuation) don't stem.
82
            string::iterator sit = (*it).begin(), eit = sit + (*it).length();
93
            string::iterator sit = (*it).begin(), eit = sit + (*it).length();
83
            if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
94
            if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
84
                ++nostem;
95
                ++nostem;
...
...
100
        if (TextSplit::isCJK(*utfit)) {
111
        if (TextSplit::isCJK(*utfit)) {
101
        // LOGDEB(("stemskipped: Skipping CJK\n"));
112
        // LOGDEB(("stemskipped: Skipping CJK\n"));
102
        continue;
113
        continue;
103
        }
114
        }
104
115
116
      // Create stemming synonym for every lang
117
      for (unsigned int i = 0; i < langs.size(); i++) {
105
            string stem = stemmer(*it);
118
      string stem = stemmers[i](*it);
106
            LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
119
      if (stem == *it) {
107
                     stem.c_str()));
120
          ++stemconst;
108
            if (stem == *it) {
121
      } else {
109
                ++stemconst;
122
          stemdbs[i].addSynonym(stem, *it);
110
                continue;
123
          LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n", 
111
            }
124
               (*it).c_str(), langs[i].c_str(), stem.c_str()));
112
    
113
      m_wdb.add_synonym(prefix + stem, *it);
114
        ++allsyns;
125
         ++allsyns;
126
      }
127
      }
128
115
        }
129
        }
116
    } XCATCHERROR(ermsg);
130
    } XCATCHERROR(ermsg);
117
    if (!ermsg.empty()) {
131
    if (!ermsg.empty()) {
118
        LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
132
        LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
119
        return false;
133
        return false;
120
    }
134
    }
121
135
122
    LOGDEB(("StemDb::createDb(%s): done: %.2f S\n", lang.c_str(), cron.secs()));
136
    LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
123
    LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n", 
137
    LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n", 
124
        nostem, stemconst, allsyns));
138
        nostem, stemconst, allsyns));
125
    return true;
139
    return true;
126
}
140
}
127
141