recoll / Code / Diff of /src/internfile/mh_text.cpp

Diff of /src/internfile/mh_text.cpp [38e095] .. [4e2266]

Switch to unified view


...
#include "cstr.h"
#include "mh_text.h"
#include "csguess.h"
#include "debuglog.h"
#include "readfile.h"

#include "md5.h"
#include "rclconfig.h"

const int MB = 1024*1024;
const int KB = 1024;
...
    LOGDEB(("MimeHandlerText::next_document: m_havedoc %d\n", int(m_havedoc)));

    if (m_havedoc == false)
    return false;

    // We transcode even if defcharset is supposedly already utf-8:
    // this validates the encoding.













    m_metaData[cstr_origcharset] = m_dfltInputCharset;

    m_metaData[cstr_mimetype] = cstr_textplain;

    size_t srclen = m_text.length();
    m_metaData[cstr_content].swap(m_text);

    // txtdcode() truncates the text if transcoding fails
    (void)txtdcode("mh_text");


    // If the text length is 0 (the file is empty or oversize), or we are 
    // not paging, we're done
    if (srclen == 0 || !m_paging) {
        m_havedoc = false;
        return true;
    } else {
        // Paging: set ipath then read next chunk. 

...
        // records for small files (one for the file, one for the
        // first chunk). This is a hack. The right thing to do would
        // be to use a different mtype for files over the page size,
        // and keep text/plain only for smaller files.
        char buf[30];
        sprintf(buf, "%lld", (long long)(m_offs - srclen));
        if (m_offs - srclen != 0)
            m_metaData[cstr_ipath] = buf;
        readnext();
        return true;
    }
}

bool MimeHandlerText::readnext()
{
    string reason;
    m_text.clear();
    if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) {
        LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));
        m_havedoc = false;
        return false;
    }

	a/src/internfile/mh_text.cpp		b/src/internfile/mh_text.cpp
	...		...
30	#include "cstr.h"	30	#include "cstr.h"
31	#include "mh_text.h"	31	#include "mh_text.h"
32	#include "csguess.h"	32	#include "csguess.h"
33	#include "debuglog.h"	33	#include "debuglog.h"
34	#include "readfile.h"	34	#include "readfile.h"
35	#include "transcode.h"
36	#include "md5.h"	35	#include "md5.h"
37	#include "rclconfig.h"	36	#include "rclconfig.h"
38		37
39	const int MB = 1024*1024;	38	const int MB = 1024*1024;
40	const int KB = 1024;	39	const int KB = 1024;
	...		...
115	LOGDEB(("MimeHandlerText::next_document: m_havedoc %d\n", int(m_havedoc)));	114	LOGDEB(("MimeHandlerText::next_document: m_havedoc %d\n", int(m_havedoc)));
116		115
117	if (m_havedoc == false)	116	if (m_havedoc == false)
118	return false;	117	return false;
119		118
120	// We transcode even if defcharset is already utf-8:	119	// We transcode even if defcharset is supposedly already utf-8:
121	// this validates the encoding.	120	// this validates the encoding.
122	LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n",
123	m_dfltInputCharset.c_str()));
124	int ecnt;
125	bool ret;
126	string& itext = m_metaData[cstr_content];
127	if (!(ret=transcode(m_text, itext, m_dfltInputCharset, "UTF-8", &ecnt)) ||
128	ecnt > int(itext.size() / 4)) {
129	LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed "
130	"for input charset [%s] ret %d ecnt %d\n",
131	m_dfltInputCharset.c_str(), ret, ecnt));
132	itext.erase();
133	return false;
134	}
135	m_metaData["origcharset"] = m_dfltInputCharset;	121	m_metaData[cstr_origcharset] = m_dfltInputCharset;
136	m_metaData[cstr_charset] = "utf-8";
137	m_metaData[cstr_mimetype] = cstr_textplain;	122	m_metaData[cstr_mimetype] = cstr_textplain;
138		123
		124	size_t srclen = m_text.length();
		125	m_metaData[cstr_content].swap(m_text);
		126
		127	// txtdcode() truncates the text if transcoding fails
		128	(void)txtdcode("mh_text");
		129
		130
139	// If text length is 0 (the file is empty or oversize), or we have	131	// If the text length is 0 (the file is empty or oversize), or we are
140	// read all at once, we're done	132	// not paging, we're done
141	if (m_text.length() == 0 || !m_paging) {	133	if (srclen == 0 || !m_paging) {
142	m_havedoc = false;	134	m_havedoc = false;
143	return true;	135	return true;
144	} else {	136	} else {
145	// Paging: set ipath then read next chunk.	137	// Paging: set ipath then read next chunk.
146		138
	...		...
148	// records for small files (one for the file, one for the	140	// records for small files (one for the file, one for the
149	// first chunk). This is a hack. The right thing to do would	141	// first chunk). This is a hack. The right thing to do would
150	// be to use a different mtype for files over the page size,	142	// be to use a different mtype for files over the page size,
151	// and keep text/plain only for smaller files.	143	// and keep text/plain only for smaller files.
152	char buf[30];	144	char buf[30];
153	sprintf(buf, "%lld", (long long)(m_offs - m_text.length()));	145	sprintf(buf, "%lld", (long long)(m_offs - srclen));
154	if (m_offs - m_text.length() != 0)	146	if (m_offs - srclen != 0)
155	m_metaData[cstr_ipath] = buf;	147	m_metaData[cstr_ipath] = buf;
156	readnext();	148	readnext();
157	return true;	149	return true;
158	}	150	}
159	}	151	}
160		152
161	bool MimeHandlerText::readnext()	153	bool MimeHandlerText::readnext()
162	{	154	{
163	string reason;	155	string reason;
164	m_text.erase();	156	m_text.clear();
165	if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) {	157	if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) {
166	LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));	158	LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));
167	m_havedoc = false;	159	m_havedoc = false;
168	return false;	160	return false;
169	}	161	}