// Trigram filter
//
// Simplified code as example for a trigram filter.
//
// by feyd//godX.de
#include <stdint.h>
#include <vector>
#include <string>
#include <map>
#include <stdio.h>
// class holding trigrams and lists
class CTrigramFilter
{
public:
CTrigramFilter() : m_nDocument(-1) {}
virtual ~CTrigramFilter() {}
int AddDocument(const char* pDocument)
{
m_nDocument++;
size_t nLength = strlen(pDocument);
if(nLength<3)
return -1;
nLength -= 2;
for(size_t n=0; n<nLength; n++)
AddTrigramDocument(GetTrigram(&pDocument[n]));
return m_nDocument;
}
void FindDocument(const char* pSearch, std::vector<int>& output)
{
size_t nLength = strlen(pSearch);
if(nLength<3)
return;
nLength -= 2;
output.clear();
bool bFirst = true;
for(size_t n=0; n<nLength; n++)
{
std::map<int, std::vector<int> >::iterator it = m_trigrams.find(GetTrigram(&pSearch[n]));
if(it==m_trigrams.end())
{
output.clear();
return;
}
std::vector<int>& search = it->second;
if(bFirst)
{
bFirst = false;
output = search;
} else {
size_t nPosOutput = 0;
size_t nPosSearch = 0;
while(nPosOutput<output.size() && nPosSearch<search.size())
{
if(output[nPosOutput]<search[nPosSearch])
{
output.erase(output.begin()+nPosOutput);
} else if(output[nPosOutput]>search[nPosSearch]) {
nPosSearch++;
} else {
nPosSearch++;
nPosOutput++;
}
}
output.resize(nPosOutput);
}
}
}
protected:
int GetTrigram(const char* pTrigram)
{
return (((int)pTrigram[0]&0xff)<<16) | (((int)pTrigram[1]&0xff)<<8) | (((int)pTrigram[2]&0xff)<<0);
}
void AddTrigramDocument(int nTrigram)
{
std::vector<int>& documentList = m_trigrams[nTrigram];
if(documentList.size()>0)
if(documentList[documentList.size()-1]==m_nDocument)
return;
documentList.push_back(m_nDocument);
}
int m_nDocument;
std::map<int, std::vector<int> > m_trigrams;
};
// Application entry point (from libc)
int main(int argc, const char* argv[])
{
printf("Trigram filter test...\r\n\r\n");
if(argc>1)
{
CTrigramFilter trigramFilter;
printf(" Input:\r\n");
std::vector<const char*> strings(argc-2);
for(int n=0; n<argc-2; n++)
{
strings[n] = argv[n+1];
printf(" %d: %s\r\n", trigramFilter.AddDocument(strings[n]), strings[n]);
}
printf(" Filter:\r\n");
std::vector<int> search;
trigramFilter.FindDocument(argv[argc-1], search);
if(search.size()>0)
{
for(size_t n=0; n<search.size(); n++)
printf(" %d: %s\r\n", search[n], strings[search[n]]);
} else {
printf(" not found\r\n");
}
} else {
printf("not enough arguments\r\n");
}
return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126