|
a/src/internfile/internfile.cpp |
|
b/src/internfile/internfile.cpp |
|
... |
|
... |
49 |
|
49 |
|
50 |
#ifdef RCL_USE_XATTR
|
50 |
#ifdef RCL_USE_XATTR
|
51 |
#include "pxattr.h"
|
51 |
#include "pxattr.h"
|
52 |
#endif // RCL_USE_XATTR
|
52 |
#endif // RCL_USE_XATTR
|
53 |
|
53 |
|
54 |
static const string stxtplain("text/plain");
|
54 |
static const string cstr_stxtplain("text/plain");
|
55 |
|
55 |
|
56 |
// The internal path element separator. This can't be the same as the rcldb
|
56 |
// The internal path element separator. This can't be the same as the rcldb
|
57 |
// file to ipath separator : "|"
|
57 |
// file to ipath separator : "|"
|
58 |
// We replace it with a control char if it comes out of a filter (ie:
|
58 |
// We replace it with a control char if it comes out of a filter (ie:
|
59 |
// rclzip or rclchm can do this). If you want the SOH control char
|
59 |
// rclzip or rclchm can do this). If you want the SOH control char
|
60 |
// inside an ipath, you're out of luck (and a bit weird).
|
60 |
// inside an ipath, you're out of luck (and a bit weird).
|
61 |
static const string isep(":");
|
61 |
static const string cstr_isep(":");
|
|
|
62 |
|
62 |
static const char colon_repl = '\x01';
|
63 |
static const char cchar_colon_repl = '\x01';
|
63 |
static string colon_hide(const string& in)
|
64 |
static string colon_hide(const string& in)
|
64 |
{
|
65 |
{
|
65 |
string out;
|
66 |
string out;
|
66 |
for (string::const_iterator it = in.begin(); it != in.end(); it++) {
|
67 |
for (string::const_iterator it = in.begin(); it != in.end(); it++) {
|
67 |
out += *it == ':' ? colon_repl : *it;
|
68 |
out += *it == ':' ? cchar_colon_repl : *it;
|
68 |
}
|
69 |
}
|
69 |
return out;
|
70 |
return out;
|
70 |
}
|
71 |
}
|
71 |
static string colon_restore(const string& in)
|
72 |
static string colon_restore(const string& in)
|
72 |
{
|
73 |
{
|
73 |
string out;
|
74 |
string out;
|
74 |
for (string::const_iterator it = in.begin(); it != in.end(); it++) {
|
75 |
for (string::const_iterator it = in.begin(); it != in.end(); it++) {
|
75 |
out += *it == colon_repl ? ':' : *it;
|
76 |
out += *it == cchar_colon_repl ? ':' : *it;
|
76 |
}
|
77 |
}
|
77 |
return out;
|
78 |
return out;
|
78 |
}
|
79 |
}
|
79 |
|
80 |
|
80 |
#ifdef RCL_USE_XATTR
|
81 |
#ifdef RCL_USE_XATTR
|
|
... |
|
... |
113 |
string::size_type colon;
|
114 |
string::size_type colon;
|
114 |
LOGDEB(("FileInterner::getEnclosing(): url [%s] ipath [%s]\n",
|
115 |
LOGDEB(("FileInterner::getEnclosing(): url [%s] ipath [%s]\n",
|
115 |
url.c_str(), eipath.c_str()));
|
116 |
url.c_str(), eipath.c_str()));
|
116 |
if (eipath.empty())
|
117 |
if (eipath.empty())
|
117 |
return false;
|
118 |
return false;
|
118 |
if ((colon = eipath.find_last_of(isep)) != string::npos) {
|
119 |
if ((colon = eipath.find_last_of(cstr_isep)) != string::npos) {
|
119 |
eipath.erase(colon);
|
120 |
eipath.erase(colon);
|
120 |
} else {
|
121 |
} else {
|
121 |
eipath.erase();
|
122 |
eipath.erase();
|
122 |
}
|
123 |
}
|
123 |
make_udi(url_gpath(eurl), eipath, udi);
|
124 |
make_udi(url_gpath(eurl), eipath, udi);
|
|
... |
|
... |
363 |
m_forPreview = ((flags & FIF_forPreview) != 0);
|
364 |
m_forPreview = ((flags & FIF_forPreview) != 0);
|
364 |
// Initialize handler stack.
|
365 |
// Initialize handler stack.
|
365 |
m_handlers.reserve(MAXHANDLERS);
|
366 |
m_handlers.reserve(MAXHANDLERS);
|
366 |
for (unsigned int i = 0; i < MAXHANDLERS; i++)
|
367 |
for (unsigned int i = 0; i < MAXHANDLERS; i++)
|
367 |
m_tmpflgs[i] = false;
|
368 |
m_tmpflgs[i] = false;
|
368 |
m_targetMType = stxtplain;
|
369 |
m_targetMType = cstr_stxtplain;
|
369 |
}
|
370 |
}
|
370 |
|
371 |
|
371 |
// We used a single beagle cache object to access beagle data. We protect it
|
372 |
// We used a single beagle cache object to access beagle data. We protect it
|
372 |
// against multiple thread access.
|
373 |
// against multiple thread access.
|
373 |
static PTMutexInit o_lock;
|
374 |
static PTMutexInit o_beagler_mutex;
|
374 |
|
375 |
|
375 |
FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf,
|
376 |
FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf,
|
376 |
TempDir& td, int flags)
|
377 |
TempDir& td, int flags)
|
377 |
: m_tdir(td), m_ok(false), m_missingdatap(0)
|
378 |
: m_tdir(td), m_ok(false), m_missingdatap(0)
|
378 |
{
|
379 |
{
|
|
... |
|
... |
420 |
return;
|
421 |
return;
|
421 |
}
|
422 |
}
|
422 |
string udi = it->second;
|
423 |
string udi = it->second;
|
423 |
|
424 |
|
424 |
{
|
425 |
{
|
425 |
PTMutexLocker locker(o_lock);
|
426 |
PTMutexLocker locker(o_beagler_mutex);
|
426 |
// Retrieve from our webcache (beagle data). The beagler
|
427 |
// Retrieve from our webcache (beagle data). The beagler
|
427 |
// object is created at the first call of this routine and
|
428 |
// object is created at the first call of this routine and
|
428 |
// deleted when the program exits.
|
429 |
// deleted when the program exits.
|
429 |
static BeagleQueueCache beagler(cnf);
|
430 |
static BeagleQueueCache o_beagler(cnf);
|
430 |
if (!beagler.getFromCache(udi, dotdoc, data)) {
|
431 |
if (!o_beagler.getFromCache(udi, dotdoc, data)) {
|
431 |
LOGINFO(("FileInterner:: failed fetch from Beagle cache for [%s]\n",
|
432 |
LOGINFO(("FileInterner:: failed fetch from Beagle cache for [%s]\n",
|
432 |
udi.c_str()));
|
433 |
udi.c_str()));
|
433 |
return;
|
434 |
return;
|
434 |
}
|
435 |
}
|
435 |
}
|
436 |
}
|
|
... |
|
... |
562 |
}
|
563 |
}
|
563 |
|
564 |
|
564 |
// These defs are for the Dijon meta array. Rcl::Doc predefined field
|
565 |
// These defs are for the Dijon meta array. Rcl::Doc predefined field
|
565 |
// names are used where appropriate. In some cases, Rcl::Doc names are
|
566 |
// names are used where appropriate. In some cases, Rcl::Doc names are
|
566 |
// used inside the Dijon metadata (ex: origcharset)
|
567 |
// used inside the Dijon metadata (ex: origcharset)
|
567 |
static const string keyau("author");
|
568 |
static const string cstr_keyau("author");
|
568 |
static const string keycs("charset");
|
569 |
static const string cstr_keycs("charset");
|
569 |
static const string keyct("content");
|
570 |
static const string cstr_keyct("content");
|
570 |
static const string keyds("description");
|
571 |
static const string cstr_keyds("description");
|
571 |
static const string keyfn("filename");
|
572 |
static const string cstr_keyfn("filename");
|
572 |
static const string keymd("modificationdate");
|
573 |
static const string cstr_keymd("modificationdate");
|
573 |
static const string keymt("mimetype");
|
574 |
static const string cstr_keymt("mimetype");
|
574 |
static const string keytt("title");
|
575 |
static const string cstr_keytt("title");
|
575 |
|
576 |
|
576 |
bool FileInterner::dijontorcl(Rcl::Doc& doc)
|
577 |
bool FileInterner::dijontorcl(Rcl::Doc& doc)
|
577 |
{
|
578 |
{
|
578 |
Dijon::Filter *df = m_handlers.back();
|
579 |
Dijon::Filter *df = m_handlers.back();
|
579 |
if (df == 0) {
|
580 |
if (df == 0) {
|
|
... |
|
... |
583 |
}
|
584 |
}
|
584 |
const map<string, string>& docdata = df->get_meta_data();
|
585 |
const map<string, string>& docdata = df->get_meta_data();
|
585 |
|
586 |
|
586 |
for (map<string,string>::const_iterator it = docdata.begin();
|
587 |
for (map<string,string>::const_iterator it = docdata.begin();
|
587 |
it != docdata.end(); it++) {
|
588 |
it != docdata.end(); it++) {
|
588 |
if (it->first == keyct) {
|
589 |
if (it->first == cstr_keyct) {
|
589 |
doc.text = it->second;
|
590 |
doc.text = it->second;
|
590 |
} else if (it->first == keymd) {
|
591 |
} else if (it->first == cstr_keymd) {
|
591 |
doc.dmtime = it->second;
|
592 |
doc.dmtime = it->second;
|
592 |
} else if (it->first == Rcl::Doc::keyoc) {
|
593 |
} else if (it->first == Rcl::Doc::keyoc) {
|
593 |
doc.origcharset = it->second;
|
594 |
doc.origcharset = it->second;
|
594 |
} else if (it->first == keymt || it->first == keycs) {
|
595 |
} else if (it->first == cstr_keymt || it->first == cstr_keycs) {
|
595 |
// don't need/want these.
|
596 |
// don't need/want these.
|
596 |
} else {
|
597 |
} else {
|
597 |
doc.meta[it->first] = it->second;
|
598 |
doc.meta[it->first] = it->second;
|
598 |
}
|
599 |
}
|
599 |
}
|
600 |
}
|
600 |
if (doc.meta[Rcl::Doc::keyabs].empty() && !doc.meta[keyds].empty()) {
|
601 |
if (doc.meta[Rcl::Doc::keyabs].empty() && !doc.meta[cstr_keyds].empty()) {
|
601 |
doc.meta[Rcl::Doc::keyabs] = doc.meta[keyds];
|
602 |
doc.meta[Rcl::Doc::keyabs] = doc.meta[cstr_keyds];
|
602 |
doc.meta.erase(keyds);
|
603 |
doc.meta.erase(cstr_keyds);
|
603 |
}
|
604 |
}
|
604 |
return true;
|
605 |
return true;
|
605 |
}
|
606 |
}
|
606 |
|
607 |
|
607 |
// Collect the ipath from the current path in the document tree.
|
608 |
// Collect the ipath from the current path in the document tree.
|
|
... |
|
... |
633 |
const map<string, string>& docdata = (*hit)->get_meta_data();
|
634 |
const map<string, string>& docdata = (*hit)->get_meta_data();
|
634 |
if (getKeyValue(docdata, "ipath", ipathel)) {
|
635 |
if (getKeyValue(docdata, "ipath", ipathel)) {
|
635 |
if (!ipathel.empty()) {
|
636 |
if (!ipathel.empty()) {
|
636 |
// We have a non-empty ipath
|
637 |
// We have a non-empty ipath
|
637 |
hasipath = true;
|
638 |
hasipath = true;
|
638 |
getKeyValue(docdata, keymt, doc.mimetype);
|
639 |
getKeyValue(docdata, cstr_keymt, doc.mimetype);
|
639 |
getKeyValue(docdata, keyfn, doc.utf8fn);
|
640 |
getKeyValue(docdata, cstr_keyfn, doc.utf8fn);
|
640 |
}
|
641 |
}
|
641 |
doc.ipath += colon_hide(ipathel) + isep;
|
642 |
doc.ipath += colon_hide(ipathel) + cstr_isep;
|
642 |
} else {
|
643 |
} else {
|
643 |
doc.ipath += isep;
|
644 |
doc.ipath += cstr_isep;
|
644 |
}
|
645 |
}
|
645 |
getKeyValue(docdata, keyau, doc.meta[Rcl::Doc::keyau]);
|
646 |
getKeyValue(docdata, cstr_keyau, doc.meta[Rcl::Doc::keyau]);
|
646 |
getKeyValue(docdata, keymd, doc.dmtime);
|
647 |
getKeyValue(docdata, cstr_keymd, doc.dmtime);
|
647 |
}
|
648 |
}
|
648 |
|
649 |
|
649 |
// Trim empty tail elements in ipath.
|
650 |
// Trim empty tail elements in ipath.
|
650 |
if (hasipath) {
|
651 |
if (hasipath) {
|
651 |
LOGDEB2(("IPATH [%s]\n", doc.ipath.c_str()));
|
652 |
LOGDEB2(("IPATH [%s]\n", doc.ipath.c_str()));
|
652 |
string::size_type sit = doc.ipath.find_last_not_of(isep);
|
653 |
string::size_type sit = doc.ipath.find_last_not_of(cstr_isep);
|
653 |
if (sit == string::npos)
|
654 |
if (sit == string::npos)
|
654 |
doc.ipath.erase();
|
655 |
doc.ipath.erase();
|
655 |
else if (sit < doc.ipath.length() -1)
|
656 |
else if (sit < doc.ipath.length() -1)
|
656 |
doc.ipath.erase(sit+1);
|
657 |
doc.ipath.erase(sit+1);
|
657 |
} else {
|
658 |
} else {
|
|
... |
|
... |
679 |
// and possibly add a filter/handler to the stack
|
680 |
// and possibly add a filter/handler to the stack
|
680 |
int FileInterner::addHandler()
|
681 |
int FileInterner::addHandler()
|
681 |
{
|
682 |
{
|
682 |
const map<string, string>& docdata = m_handlers.back()->get_meta_data();
|
683 |
const map<string, string>& docdata = m_handlers.back()->get_meta_data();
|
683 |
string charset, mimetype;
|
684 |
string charset, mimetype;
|
684 |
getKeyValue(docdata, keycs, charset);
|
685 |
getKeyValue(docdata, cstr_keycs, charset);
|
685 |
getKeyValue(docdata, keymt, mimetype);
|
686 |
getKeyValue(docdata, cstr_keymt, mimetype);
|
686 |
|
687 |
|
687 |
LOGDEB(("FileInterner::addHandler: next_doc is %s\n", mimetype.c_str()));
|
688 |
LOGDEB(("FileInterner::addHandler: next_doc is %s\n", mimetype.c_str()));
|
688 |
|
689 |
|
689 |
// If we find a document of the target type (text/plain in
|
690 |
// If we find a document of the target type (text/plain in
|
690 |
// general), we're done decoding. If we hit text/plain, we're done
|
691 |
// general), we're done decoding. If we hit text/plain, we're done
|
691 |
// in any case
|
692 |
// in any case
|
692 |
if (!stringicmp(mimetype, m_targetMType) ||
|
693 |
if (!stringicmp(mimetype, m_targetMType) ||
|
693 |
!stringicmp(mimetype, stxtplain)) {
|
694 |
!stringicmp(mimetype, cstr_stxtplain)) {
|
694 |
m_reachedMType = mimetype;
|
695 |
m_reachedMType = mimetype;
|
695 |
LOGDEB1(("FileInterner::addHandler: target reached\n"));
|
696 |
LOGDEB1(("FileInterner::addHandler: target reached\n"));
|
696 |
return ADD_BREAK;
|
697 |
return ADD_BREAK;
|
697 |
}
|
698 |
}
|
698 |
|
699 |
|
|
... |
|
... |
721 |
// copying the text, which may be big.
|
722 |
// copying the text, which may be big.
|
722 |
string ns;
|
723 |
string ns;
|
723 |
const string *txt = &ns;
|
724 |
const string *txt = &ns;
|
724 |
{
|
725 |
{
|
725 |
map<string,string>::const_iterator it;
|
726 |
map<string,string>::const_iterator it;
|
726 |
it = docdata.find(keyct);
|
727 |
it = docdata.find(cstr_keyct);
|
727 |
if (it != docdata.end())
|
728 |
if (it != docdata.end())
|
728 |
txt = &it->second;
|
729 |
txt = &it->second;
|
729 |
}
|
730 |
}
|
730 |
bool setres = false;
|
731 |
bool setres = false;
|
731 |
if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
|
732 |
if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
|
|
... |
|
... |
793 |
// when they're pushed on the stack
|
794 |
// when they're pushed on the stack
|
794 |
vector<string> vipath;
|
795 |
vector<string> vipath;
|
795 |
int vipathidx = 0;
|
796 |
int vipathidx = 0;
|
796 |
if (!ipath.empty()) {
|
797 |
if (!ipath.empty()) {
|
797 |
vector<string> lipath;
|
798 |
vector<string> lipath;
|
798 |
stringToTokens(ipath, lipath, isep, true);
|
799 |
stringToTokens(ipath, lipath, cstr_isep, true);
|
799 |
for (vector<string>::iterator it = lipath.begin();
|
800 |
for (vector<string>::iterator it = lipath.begin();
|
800 |
it != lipath.end(); it++) {
|
801 |
it != lipath.end(); it++) {
|
801 |
*it = colon_restore(*it);
|
802 |
*it = colon_restore(*it);
|
802 |
}
|
803 |
}
|
803 |
vipath.insert(vipath.begin(), lipath.begin(), lipath.end());
|
804 |
vipath.insert(vipath.begin(), lipath.begin(), lipath.end());
|