Parent: [b28eaf] (diff)

Child: [7048d2] (diff)

Download this file

fileudi.cpp    132 lines (114 with data), 4.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef TEST_FILEUDI
#include "autoconfig.h"
#include <stdio.h>
#include <cstdlib>
#include "fileudi.h"
#include "md5.h"
#include "base64.h"
using std::string;
// Debug only
#ifdef PATHHASH_HEX
// Debug helper: write the 16 bytes of an md5 digest into 'out' as 32
// lowercase hexadecimal characters. Any previous content of 'out' is
// discarded.
static void md5hexprint(const unsigned char hash[16], string &out)
{
    static const char digits[] = "0123456789abcdef";

    out.clear();
    out.reserve(33);
    // Two output characters per input byte: high nibble first, then low
    for (const unsigned char *bp = hash; bp != hash + 16; ++bp) {
        out.push_back(digits[*bp >> 4]);
        out.push_back(digits[*bp & 0x0f]);
    }
}
#endif
// Size of the hashed result (base64 of 16 bytes of md5, minus 2 pad chars)
#define HASHLEN 22
// Convert longish paths by truncating and appending hash of path
// The full length of the base64-encoded (minus pad) of the md5 is 22 chars
// We append this to the truncated path
void pathHash(const std::string &path, std::string &phash, unsigned int maxlen)
{
if (maxlen < HASHLEN) {
fprintf(stderr, "pathHash: internal error: requested len too small\n");
abort();
}
if (path.length() <= maxlen) {
phash = path;
return;
}
// Compute the md5
unsigned char chash[16];
MD5_CTX ctx;
MD5Init(&ctx);
MD5Update(&ctx, (const unsigned char *)(path.c_str()+maxlen-HASHLEN),
path.length() - (maxlen - HASHLEN));
MD5Final(chash, &ctx);
#ifdef PATHHASH_HEX
string hex;
md5hexprint(chash, hex);
printf("hex [%s]\n", hex.c_str());
#endif
// Encode it to ascii. This shouldn't be strictly necessary as
// xapian terms can be binary
string hash;
base64_encode(string((char *)chash, 16), hash);
// We happen to know there will be 2 pad chars in there, that we
// don't need as this won't ever be decoded. Resulting length is 22
hash.resize(hash.length() - 2);
// Truncate path and append hash
phash = path.substr(0, maxlen - HASHLEN) + hash;
}
// Upper bound on the length of the path/unique term stored for each
// document, hash suffix included. Longer values are truncated and made
// unique again by appending a hash. The purpose is to stay under the
// xapian maximum term length (which seems to be around 250), not to save
// space: the gain is very small even with a much shorter limit like 30.
#define PATHHASHLEN 150

// Build the unique term which links a document to its file-system
// source.
//
// @param fn    file name
// @param ipath internal path inside the file, possibly empty
// @param udi   output: fn + "|" + ipath, shortened by pathHash() to at
//              most PATHHASHLEN characters
void make_udi(const string& fn, const string& ipath, string &udi)
{
    // The "|" separator is appended in all cases, even when ipath is
    // empty. Historical, could be removed.
    const string joined = fn + "|" + ipath;
    pathHash(joined, udi, PATHHASHLEN);
}
#else // TEST_FILEUDI
#include <stdio.h>
#include <string>
#include "fileudi.h"
using namespace std;
// Test driver: compute and print the udi for a short file name/ipath
// pair, then for a very long one.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    // Each entry is { file name, internal path }
    static const char *const samples[][2] = {
        {
            "/usr/lib/toto.cpp",
            "1:2:3:4:5:10"
        },
        {
            "/some/much/too/looooooooooooooong/path/bla/bla/bla"
            "/looooooooooooooong/path/bla/bla/bla/llllllllllllllllll"
            "/looooooooooooooong/path/bla/bla/bla/llllllllllllllllll",
            "1:2:3:4:5:10"
            "1:2:3:4:5:10"
            "1:2:3:4:5:10"
        },
    };
    const unsigned int nsamples = sizeof(samples) / sizeof(samples[0]);

    std::string udi;
    for (unsigned int idx = 0; idx < nsamples; idx++) {
        const std::string path(samples[idx][0]);
        const std::string ipath(samples[idx][1]);
        make_udi(path, ipath, udi);
        printf("udi [%s]\n", udi.c_str());
    }
    return 0;
}
#endif // TEST_FILEUDI