613 lines
19 KiB
C++
613 lines
19 KiB
C++
/****************************************************************************
|
|
**
|
|
** Copyright (C) 2015 The Qt Company Ltd.
|
|
** Contact: http://www.qt.io/licensing/
|
|
**
|
|
** This file is part of the Qt Assistant of the Qt Toolkit.
|
|
**
|
|
** $QT_BEGIN_LICENSE:LGPL$
|
|
** Commercial License Usage
|
|
** Licensees holding valid commercial Qt licenses may use this file in
|
|
** accordance with the commercial license agreement provided with the
|
|
** Software or, alternatively, in accordance with the terms contained in
|
|
** a written agreement between you and The Qt Company. For licensing terms
|
|
** and conditions see http://www.qt.io/terms-conditions. For further
|
|
** information use the contact form at http://www.qt.io/contact-us.
|
|
**
|
|
** GNU Lesser General Public License Usage
|
|
** Alternatively, this file may be used under the terms of the GNU Lesser
|
|
** General Public License version 2.1 or version 3 as published by the Free
|
|
** Software Foundation and appearing in the file LICENSE.LGPLv21 and
|
|
** LICENSE.LGPLv3 included in the packaging of this file. Please review the
|
|
** following information to ensure the GNU Lesser General Public License
|
|
** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
|
|
** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
|
|
**
|
|
** As a special exception, The Qt Company gives you certain additional
|
|
** rights. These rights are described in The Qt Company LGPL Exception
|
|
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
|
|
**
|
|
** GNU General Public License Usage
|
|
** Alternatively, this file may be used under the terms of the GNU
|
|
** General Public License version 3.0 as published by the Free Software
|
|
** Foundation and appearing in the file LICENSE.GPL included in the
|
|
** packaging of this file. Please review the following information to
|
|
** ensure the GNU General Public License version 3.0 requirements will be
|
|
** met: http://www.gnu.org/copyleft/gpl.html.
|
|
**
|
|
** $QT_END_LICENSE$
|
|
**
|
|
****************************************************************************/
|
|
|
|
#include "qhelpenginecore.h"
|
|
#include "qhelpsearchindexreader_default_p.h"
|
|
|
|
#include <QtCore/QDir>
|
|
#include <QtCore/QUrl>
|
|
#include <QtCore/QFile>
|
|
#include <QtCore/QVariant>
|
|
#include <QtCore/QFileInfo>
|
|
#include <QtCore/QDataStream>
|
|
#include <QtCore/QTextStream>
|
|
|
|
QT_BEGIN_NAMESPACE
|
|
|
|
namespace fulltextsearch {
|
|
namespace std {
|
|
|
|
namespace {
|
|
// Breaks a wildcard search term into its literal pieces and "*" markers.
//
// Each non-empty literal run that is followed by a '*' is appended together
// with a "*" marker; a leading '*' produces a leading "*" marker; any text
// after the last '*' is appended as the final piece. Runs of consecutive '*'
// characters do not produce additional markers.
QStringList split(const QString &str)
{
    const QLatin1Char star('*');
    QStringList pieces;

    if (str.startsWith(QLatin1String("*")))
        pieces << QLatin1String("*");

    int start = 0;
    for (int pos = str.indexOf(star, start); pos != -1; pos = str.indexOf(star, start)) {
        // Only emit a piece when there is text between the previous '*' and this one.
        if (pos > start) {
            pieces << str.mid(start, pos - start);
            pieces << QLatin1String("*");
        }
        start = pos + 1;
    }

    // Whatever follows the last '*' (or the whole string if there was none).
    const QString tail = str.mid(start, str.length() - start);
    if (!tail.isEmpty())
        pieces << tail;

    return pieces;
}
|
|
}
|
|
|
|
|
|
// Creates a reader with empty path/file names and empty lookup tables.
Reader::Reader()
    : indexPath()
    , indexFile()
    , documentFile()
{
    // Explicitly start from empty tables.
    searchIndexTable.clear();
    indexTable.clear();
    termList.clear();
}
|
|
|
|
// Releases everything held by the reader.
Reader::~Reader()
{
    // reset() deletes the Entry objects referenced from indexTable (via
    // cleanupIndex()); the filtered view is dropped afterwards.
    reset();
    searchIndexTable.clear();
}
|
|
|
|
bool Reader::readIndex()
|
|
{
|
|
if (indexTable.contains(indexFile))
|
|
return true;
|
|
|
|
QFile idxFile(indexFile);
|
|
if (!idxFile.open(QFile::ReadOnly))
|
|
return false;
|
|
|
|
QString key;
|
|
int numOfDocs;
|
|
EntryTable entryTable;
|
|
QVector<Document> docs;
|
|
QDataStream dictStream(&idxFile);
|
|
while (!dictStream.atEnd()) {
|
|
dictStream >> key;
|
|
dictStream >> numOfDocs;
|
|
docs.resize(numOfDocs);
|
|
dictStream >> docs;
|
|
entryTable.insert(key, new Entry(docs));
|
|
}
|
|
idxFile.close();
|
|
|
|
if (entryTable.isEmpty())
|
|
return false;
|
|
|
|
QFile docFile(documentFile);
|
|
if (!docFile.open(QFile::ReadOnly))
|
|
return false;
|
|
|
|
QString title, url;
|
|
DocumentList documentList;
|
|
QDataStream docStream(&docFile);
|
|
while (!docStream.atEnd()) {
|
|
docStream >> title;
|
|
docStream >> url;
|
|
documentList.append(QStringList(title) << url);
|
|
}
|
|
docFile.close();
|
|
|
|
if (documentList.isEmpty()) {
|
|
cleanupIndex(entryTable);
|
|
return false;
|
|
}
|
|
|
|
indexTable.insert(indexFile, Index(entryTable, documentList));
|
|
return true;
|
|
}
|
|
|
|
bool Reader::initCheck() const
|
|
{
|
|
return !searchIndexTable.isEmpty();
|
|
}
|
|
|
|
void Reader::setIndexPath(const QString &path)
|
|
{
|
|
indexPath = path;
|
|
}
|
|
|
|
void Reader::filterFilesForAttributes(const QStringList &attributes)
|
|
{
|
|
searchIndexTable.clear();
|
|
for(IndexTable::ConstIterator it = indexTable.begin(); it != indexTable.end(); ++it) {
|
|
const QString fileName = it.key();
|
|
bool containsAll = true;
|
|
QStringList split = fileName.split(QLatin1String("@"));
|
|
foreach (const QString &attribute, attributes) {
|
|
if (!split.contains(attribute, Qt::CaseInsensitive)) {
|
|
containsAll = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (containsAll)
|
|
searchIndexTable.insert(fileName, it.value());
|
|
}
|
|
}
|
|
|
|
void Reader::setIndexFile(const QString &namespaceName, const QString &attributes)
|
|
{
|
|
QString extension = namespaceName + QLatin1String("@") + attributes;
|
|
indexFile = indexPath + QLatin1String("/indexdb40.") + extension;
|
|
documentFile = indexPath + QLatin1String("/indexdoc40.") + extension;
|
|
}
|
|
|
|
bool Reader::splitSearchTerm(const QString &searchTerm, QStringList *terms,
|
|
QStringList *termSeq, QStringList *seqWords)
|
|
{
|
|
QString term = searchTerm;
|
|
|
|
term = term.simplified();
|
|
term = term.replace(QLatin1String("\'"), QLatin1String("\""));
|
|
term = term.replace(QLatin1String("`"), QLatin1String("\""));
|
|
term = term.replace(QLatin1String("-"), QLatin1String(" "));
|
|
term = term.replace(QRegExp(QLatin1String("\\s[\\S]?\\s")), QLatin1String(" "));
|
|
|
|
*terms = term.split(QLatin1Char(' '));
|
|
QStringList::iterator it = terms->begin();
|
|
for (; it != terms->end(); ++it) {
|
|
(*it) = (*it).simplified();
|
|
(*it) = (*it).toLower();
|
|
(*it) = (*it).replace(QLatin1String("\""), QLatin1String(""));
|
|
}
|
|
|
|
if (term.contains(QLatin1Char('\"'))) {
|
|
if ((term.count(QLatin1Char('\"')))%2 == 0) {
|
|
int beg = 0;
|
|
int end = 0;
|
|
QString s;
|
|
beg = term.indexOf(QLatin1Char('\"'), beg);
|
|
while (beg != -1) {
|
|
beg++;
|
|
end = term.indexOf(QLatin1Char('\"'), beg);
|
|
s = term.mid(beg, end - beg);
|
|
s = s.toLower();
|
|
s = s.simplified();
|
|
if (s.contains(QLatin1Char('*'))) {
|
|
qWarning("Full Text Search, using a wildcard within phrases is not allowed.");
|
|
return false;
|
|
}
|
|
*seqWords += s.split(QLatin1Char(' '));
|
|
*termSeq << s;
|
|
beg = term.indexOf(QLatin1Char('\"'), end + 1);
|
|
}
|
|
} else {
|
|
qWarning("Full Text Search, the closing quotation mark is missing.");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void Reader::searchInIndex(const QStringList &terms)
|
|
{
|
|
foreach (const QString &term, terms) {
|
|
QVector<Document> documents;
|
|
|
|
for(IndexTable::ConstIterator it = searchIndexTable.begin();
|
|
it != searchIndexTable.end(); ++it) {
|
|
EntryTable entryTable = it.value().first;
|
|
DocumentList documentList = it.value().second;
|
|
|
|
if (term.contains(QLatin1Char('*')))
|
|
documents = setupDummyTerm(getWildcardTerms(term, entryTable), entryTable);
|
|
else if (entryTable.value(term))
|
|
documents = entryTable.value(term)->documents;
|
|
else
|
|
continue;
|
|
|
|
if (!documents.isEmpty()) {
|
|
DocumentInfo info;
|
|
QString title, url;
|
|
QVector<DocumentInfo> documentsInfo;
|
|
foreach(const Document &doc, documents) {
|
|
info.docNumber = doc.docNumber;
|
|
info.frequency = doc.frequency;
|
|
info.documentUrl = documentList.at(doc.docNumber).at(1);
|
|
info.documentTitle = documentList.at(doc.docNumber).at(0);
|
|
documentsInfo.append(info);
|
|
}
|
|
|
|
bool found = false;
|
|
for(QList<TermInfo>::Iterator tit = termList.begin();
|
|
tit != termList.end(); ++tit) {
|
|
TermInfo *t = &(*tit);
|
|
if(t->term == term) {
|
|
t->documents += documentsInfo;
|
|
t->frequency += documentsInfo.count();
|
|
found = true; break;
|
|
}
|
|
}
|
|
if (!found)
|
|
termList.append(TermInfo(term, documentsInfo.count(), documentsInfo));
|
|
}
|
|
}
|
|
}
|
|
qSort(termList);
|
|
}
|
|
|
|
// Combines the per-term results in termList into the final hit list.
//
// The documents of the first term are taken (and that term is removed from
// termList); every document that does not also appear in each remaining
// term is dropped, and the frequencies of the ones that do are summed up.
// The surviving documents are returned sorted. Returns an empty vector when
// termList is empty.
QVector<DocumentInfo> Reader::hits()
{
    if (termList.isEmpty())
        return QVector<DocumentInfo>();

    QVector<DocumentInfo> result = termList.takeFirst().documents;

    for (QList<TermInfo>::Iterator termIt = termList.begin();
         termIt != termList.end(); ++termIt) {
        const QVector<DocumentInfo> candidates = termIt->documents;

        QVector<DocumentInfo>::Iterator current = result.begin();
        while (current != result.end()) {
            // Find the first candidate that refers to the same document.
            QVector<DocumentInfo>::ConstIterator match = candidates.constBegin();
            while (match != candidates.constEnd()
                   && match->docNumber != current->docNumber) {
                ++match;
            }

            if (match != candidates.constEnd()) {
                current->frequency += match->frequency;
                ++current;
            } else {
                current = result.erase(current);
            }
        }
    }

    qSort(result);
    return result;
}
|
|
|
|
bool Reader::searchForPattern(const QStringList &patterns, const QStringList &words,
|
|
const QByteArray &data)
|
|
{
|
|
if (data.isEmpty())
|
|
return false;
|
|
|
|
for(QHash<QString, PosEntry*>::ConstIterator mit =
|
|
miniIndex.begin(); mit != miniIndex.end(); ++mit) {
|
|
delete mit.value();
|
|
}
|
|
miniIndex.clear();
|
|
|
|
wordNum = 3;
|
|
QStringList::ConstIterator cIt = words.begin();
|
|
for ( ; cIt != words.end(); ++cIt )
|
|
miniIndex.insert(*cIt, new PosEntry(0));
|
|
|
|
QTextStream s(data);
|
|
QString text = s.readAll();
|
|
bool valid = true;
|
|
const QChar *buf = text.unicode();
|
|
QChar str[64];
|
|
QChar c = buf[0];
|
|
int j = 0;
|
|
int i = 0;
|
|
while ( j < text.length() ) {
|
|
if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) {
|
|
valid = false;
|
|
if ( i > 1 )
|
|
buildMiniIndex( QString(str,i) );
|
|
i = 0;
|
|
c = buf[++j];
|
|
continue;
|
|
}
|
|
if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) {
|
|
valid = true;
|
|
c = buf[++j];
|
|
continue;
|
|
}
|
|
if ( !valid ) {
|
|
c = buf[++j];
|
|
continue;
|
|
}
|
|
if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) {
|
|
str[i] = c.toLower();
|
|
++i;
|
|
} else {
|
|
if ( i > 1 )
|
|
buildMiniIndex( QString(str,i) );
|
|
i = 0;
|
|
}
|
|
c = buf[++j];
|
|
}
|
|
if ( i > 1 )
|
|
buildMiniIndex( QString(str,i) );
|
|
|
|
QStringList::ConstIterator patIt = patterns.begin();
|
|
QStringList wordLst;
|
|
QList<uint> a, b;
|
|
QList<uint>::iterator aIt;
|
|
for ( ; patIt != patterns.end(); ++patIt ) {
|
|
wordLst = (*patIt).split(QLatin1Char(' '));
|
|
a = miniIndex[ wordLst[0] ]->positions;
|
|
for ( int j = 1; j < (int)wordLst.count(); ++j ) {
|
|
b = miniIndex[ wordLst[j] ]->positions;
|
|
aIt = a.begin();
|
|
while ( aIt != a.end() ) {
|
|
if ( b.contains( *aIt + 1 )) {
|
|
(*aIt)++;
|
|
++aIt;
|
|
} else {
|
|
aIt = a.erase( aIt );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if ( a.count() )
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
// Merges the document lists of all \a terms that exist in \a entryTable into
// a single list without duplicates.
//
// The matching terms are sorted; the document list of the last one after
// sorting forms the start of the result, and documents of the remaining
// terms are appended if not yet present. Returns an empty vector when none
// of the terms is found in \a entryTable.
QVector<Document> Reader::setupDummyTerm(const QStringList &terms,
                                         const EntryTable &entryTable)
{
    QList<Term> matchedTerms;
    foreach (const QString &word, terms) {
        Entry *entry = entryTable.value(word);
        if (entry)
            matchedTerms.append(Term(word, entry->documents.count(), entry->documents));
    }

    if (matchedTerms.isEmpty())
        return QVector<Document>();

    qSort(matchedTerms);

    QVector<Document> merged = matchedTerms.takeLast().documents;
    for (QList<Term>::Iterator termIt = matchedTerms.begin();
         termIt != matchedTerms.end(); ++termIt) {
        const QVector<Document> docs = termIt->documents;
        for (QVector<Document>::ConstIterator docIt = docs.constBegin();
             docIt != docs.constEnd(); ++docIt) {
            if (!merged.contains(*docIt))
                merged.append(*docIt);
        }
    }
    return merged;
}
|
|
|
|
// Returns all keys of \a entryTable that match the wildcard expression
// \a term.
//
// The expression is broken up with split() into literal pieces and "*"
// markers. For each key the pieces are searched from left to right: a
// leading literal piece has to agree with the key in its first character,
// every literal piece has to be found at or after the end of the previous
// one, and a piece equal to the last piece has to sit at the very end of
// the key.
QStringList Reader::getWildcardTerms(const QString &term,
                                     const EntryTable &entryTable)
{
    QStringList matches;
    QStringList pieces = split(term);

    for (EntryTable::ConstIterator entry = entryTable.begin();
         entry != entryTable.end(); ++entry) {
        QString candidate(entry.key());
        int pos = 0;
        bool matched = false;

        for (QStringList::Iterator piece = pieces.begin(); piece != pieces.end(); ++piece) {
            if (*piece == QLatin1String("*")) {
                matched = true;
                continue;
            }

            // A leading literal piece must agree with the key's first character.
            if (piece == pieces.begin() && (*piece)[0] != candidate[0]) {
                matched = false;
                break;
            }

            pos = candidate.indexOf(*piece, pos);

            // A piece equal to the final piece has to be a suffix of the key.
            if (*piece == pieces.last() && pos != (int)candidate.length() - 1) {
                pos = candidate.lastIndexOf(*piece);
                if (pos != (int)candidate.length() - (int)(*piece).length()) {
                    matched = false;
                    break;
                }
            }

            if (pos == -1) {
                matched = false;
                break;
            }

            matched = true;
            pos += (*piece).length();
        }

        if (matched)
            matches << candidate;
    }

    return matches;
}
|
|
|
|
void Reader::buildMiniIndex(const QString &string)
|
|
{
|
|
if (miniIndex[string])
|
|
miniIndex[string]->positions.append(wordNum);
|
|
++wordNum;
|
|
}
|
|
|
|
void Reader::reset()
|
|
{
|
|
for(IndexTable::Iterator it = indexTable.begin();
|
|
it != indexTable.end(); ++it) {
|
|
cleanupIndex(it.value().first);
|
|
it.value().second.clear();
|
|
}
|
|
}
|
|
|
|
// Deletes every Entry referenced by \a entryTable and then empties the table.
void Reader::cleanupIndex(EntryTable &entryTable)
{
    EntryTable::ConstIterator entry = entryTable.begin();
    while (entry != entryTable.end()) {
        delete entry.value();
        ++entry;
    }

    entryTable.clear();
}
|
|
|
|
|
|
// Constructs the default index reader; all set-up is done by the base class.
QHelpSearchIndexReaderDefault::QHelpSearchIndexReaderDefault()
    : QHelpSearchIndexReader()
{
    // Intentionally empty.
}
|
|
|
|
// Destroys the reader; no explicit clean-up is performed here.
QHelpSearchIndexReaderDefault::~QHelpSearchIndexReaderDefault()
{
    // Intentionally empty.
}
|
|
|
|
void QHelpSearchIndexReaderDefault::run()
|
|
{
|
|
mutex.lock();
|
|
|
|
if (m_cancel) {
|
|
mutex.unlock();
|
|
return;
|
|
}
|
|
|
|
const QList<QHelpSearchQuery> &queryList = this->m_query;
|
|
const QLatin1String key("DefaultSearchNamespaces");
|
|
const QString collectionFile(this->m_collectionFile);
|
|
const QString indexPath = m_indexFilesFolder;
|
|
|
|
mutex.unlock();
|
|
|
|
QString queryTerm;
|
|
foreach (const QHelpSearchQuery &query, queryList) {
|
|
if (query.fieldName == QHelpSearchQuery::DEFAULT) {
|
|
queryTerm = query.wordList.at(0);
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (queryTerm.isEmpty())
|
|
return;
|
|
|
|
QHelpEngineCore engine(collectionFile, 0);
|
|
if (!engine.setupData())
|
|
return;
|
|
|
|
const QStringList registeredDocs = engine.registeredDocumentations();
|
|
const QStringList indexedNamespaces = engine.customValue(key).toString().
|
|
split(QLatin1String("|"), QString::SkipEmptyParts);
|
|
|
|
emit searchingStarted();
|
|
|
|
// setup the reader
|
|
m_reader.setIndexPath(indexPath);
|
|
foreach(const QString &namespaceName, registeredDocs) {
|
|
mutex.lock();
|
|
if (m_cancel) {
|
|
mutex.unlock();
|
|
searchingFinished(0); // TODO: check this ???
|
|
return;
|
|
}
|
|
mutex.unlock();
|
|
|
|
const QList<QStringList> attributeSets =
|
|
engine.filterAttributeSets(namespaceName);
|
|
|
|
foreach (const QStringList &attributes, attributeSets) {
|
|
// read all index files
|
|
m_reader.setIndexFile(namespaceName, attributes.join(QLatin1String("@")));
|
|
if (!m_reader.readIndex()) {
|
|
qWarning("Full Text Search, could not read file for namespace: %s.",
|
|
namespaceName.toUtf8().constData());
|
|
}
|
|
}
|
|
}
|
|
|
|
// get the current filter attributes and minimize the index files table
|
|
m_reader.filterFilesForAttributes(engine.filterAttributes(engine.currentFilter()));
|
|
|
|
hitList.clear();
|
|
QStringList terms, termSeq, seqWords;
|
|
if (m_reader.initCheck() && // check if we could read anything
|
|
m_reader.splitSearchTerm(queryTerm, &terms, &termSeq, &seqWords) ) {
|
|
|
|
// search for term(s)
|
|
m_reader.searchInIndex(terms); // TODO: should this be interruptible as well ???
|
|
|
|
QVector<DocumentInfo> hits = m_reader.hits();
|
|
if (!hits.isEmpty()) {
|
|
if (termSeq.isEmpty()) {
|
|
foreach (const DocumentInfo &docInfo, hits) {
|
|
mutex.lock();
|
|
if (m_cancel) {
|
|
mutex.unlock();
|
|
searchingFinished(0); // TODO: check this, speed issue while locking???
|
|
return;
|
|
}
|
|
mutex.unlock();
|
|
hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl));
|
|
}
|
|
} else {
|
|
foreach (const DocumentInfo &docInfo, hits) {
|
|
mutex.lock();
|
|
if (m_cancel) {
|
|
mutex.unlock();
|
|
searchingFinished(0); // TODO: check this, speed issue while locking???
|
|
return;
|
|
}
|
|
mutex.unlock();
|
|
|
|
if (m_reader.searchForPattern(termSeq, seqWords, engine.fileData(docInfo.documentUrl))) // TODO: should this be interruptible as well ???
|
|
hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
emit searchingFinished(hitList.count());
|
|
}
|
|
|
|
} // namespace std
|
|
} // namespace fulltextsearch
|
|
|
|
QT_END_NAMESPACE
|