Lucene++ - a full-featured, c++ search engine
API Documentation


Loading...
Searching...
No Matches
NGramTokenizer.h
Go to the documentation of this file.
1
2// Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3// Distributable under the terms of either the Apache License (Version 2.0)
4// or the GNU Lesser General Public License.
6
7#ifndef NGRAMTOKENIZER_H
8#define NGRAMTOKENIZER_H
9
10#include "Tokenizer.h"
11
12namespace Lucene {
13
19class LPPAPI NGramTokenizer : public Tokenizer {
20public:
21 static const int32_t DEFAULT_MIN_NGRAM_SIZE;
22 static const int32_t DEFAULT_MAX_NGRAM_SIZE;
23
24public:
26 NGramTokenizer(const ReaderPtr& input, int32_t minGram, int32_t maxGram);
27 NGramTokenizer(const AttributeSourcePtr& source, const ReaderPtr& input, int32_t minGram, int32_t maxGram);
28 NGramTokenizer(const AttributeFactoryPtr& factory, const ReaderPtr& input, int32_t minGram, int32_t maxGram);
29
30 virtual ~NGramTokenizer();
31
33
34protected:
35 CharArray buffer;
36 int32_t bufferStart;
37 int32_t bufferEnd;
38 int32_t offset;
39 int32_t gramSize;
40 int32_t minGram;
41 int32_t maxGram;
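42 bool exhausted;
43 int32_t lastCheckedChar;
44 int32_t lastNonTokenChar;
45
46 TermAttributePtr termAtt;
47 OffsetAttributePtr offsetAtt;
48 PositionIncrementAttributePtr posIncrAtt;
49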
50protected:
51 void init(int32_t minGram, int32_t maxGram);
52 void resetState();
53 void fillBuffer();
54 void consume();
56
58 virtual bool isTokenChar(wchar_t chr);
59
60public:
61 virtual bool incrementToken();
62 virtual void end();
63 virtual void reset();
64 virtual void reset(const ReaderPtr& input);
65};
66
67}
68
69#endif
#define LUCENE_CLASS(Name)
Definition LuceneObject.h:24
Tokenizes the input into n-grams of the given size(s).
Definition NGramTokenizer.h:19
int32_t bufferEnd
Definition NGramTokenizer.h:37
int32_t lastCheckedChar
Definition NGramTokenizer.h:43
OffsetAttributePtr offsetAtt
Definition NGramTokenizer.h:47
PositionIncrementAttributePtr posIncrAtt
Definition NGramTokenizer.h:48
bool exhausted
Definition NGramTokenizer.h:42
int32_t gramSize
Definition NGramTokenizer.h:39
int32_t offset
Definition NGramTokenizer.h:38
virtual bool incrementToken()
Consumers (i.e., IndexWriter) use this method to advance the stream to the next token....
TermAttributePtr termAtt
Definition NGramTokenizer.h:46
virtual void reset()
Resets this stream to the beginning. This is an optional operation, so subclasses may or may not impl...
NGramTokenizer(const ReaderPtr &input)
int32_t bufferStart
Definition NGramTokenizer.h:36
int32_t maxGram
Definition NGramTokenizer.h:41
NGramTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input, int32_t minGram, int32_t maxGram)
int32_t minGram
Definition NGramTokenizer.h:40
virtual void end()
This method is called by the consumer after the last token has been consumed, after incrementToken() ...
CharArray buffer
Definition NGramTokenizer.h:35
virtual void reset(const ReaderPtr &input)
Reset the tokenizer to a new reader. Typically, an analyzer (in its reusableTokenStream method) will ...
static const int32_t DEFAULT_MAX_NGRAM_SIZE
Definition NGramTokenizer.h:22
NGramTokenizer(const AttributeSourcePtr &source, const ReaderPtr &input, int32_t minGram, int32_t maxGram)
void init(int32_t minGram, int32_t maxGram)
virtual bool isTokenChar(wchar_t chr)
Only collect characters which satisfy this condition.
int32_t lastNonTokenChar
Definition NGramTokenizer.h:44
static const int32_t DEFAULT_MIN_NGRAM_SIZE
Definition NGramTokenizer.h:21
NGramTokenizer(const ReaderPtr &input, int32_t minGram, int32_t maxGram)
A Tokenizer is a TokenStream whose input is a Reader.
Definition Tokenizer.h:20
Definition AbstractAllTermDocs.h:12
boost::shared_ptr< AttributeSource > AttributeSourcePtr
Definition LuceneTypes.h:523
boost::shared_ptr< PositionIncrementAttribute > PositionIncrementAttributePtr
Definition LuceneTypes.h:48
boost::shared_ptr< TermAttribute > TermAttributePtr
Definition LuceneTypes.h:61
boost::shared_ptr< OffsetAttribute > OffsetAttributePtr
Definition LuceneTypes.h:43
boost::shared_ptr< Reader > ReaderPtr
Definition LuceneTypes.h:550
boost::shared_ptr< AttributeFactory > AttributeFactoryPtr
Definition LuceneTypes.h:522

clucene.sourceforge.net