#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/shared_ptr.hpp>
#include "utf8.h"
#define WIN32_LEAN_AND_MEAN
#define WIN32_EXTRA_LEAN
#define NOMINMAX
#include <Windows.h>
// Shared-ownership handle to a heap-allocated wide string.
// No live code visible in this file uses this alias.
typedef boost::shared_ptr<std::wstring> strPtr;
/**
 * Advances a cursor until it reaches a delimiter or the end of the range.
 *
 * @param p     Cursor into the text; updated in place. On return it points
 *              at the first occurrence of delim at or after its starting
 *              position, or equals end if there is none.
 * @param delim Character to search for.
 * @param end   One past the last readable character; never dereferenced.
 * @return true if p stopped on delim, false if the range was exhausted.
 */
bool parseTo(const wchar_t *&p, wchar_t delim, const wchar_t * const end)
{
    // Check the bound first so that *end is never read.
    while (p != end && *p != delim)
        ++p;
    return p != end;
}
/**
 * Locates the next delimiter and moves the cursor just past it.
 *
 * @param p     Cursor into the text; updated in place.
 * @param delim Character to search for (via parseTo).
 * @param end   One past the last character of the text.
 * @return true only when delim was found AND at least one character follows
 *         it; in that case p points at the character after delim. Returns
 *         false if delim is absent or is the final character (p == end).
 */
bool parseToAndSkip(const wchar_t *&p, wchar_t delim, const wchar_t * const end)
{
    const bool found = parseTo(p, delim, end);
    if (!found)
        return false;

    // Step over the delimiter itself; nothing left after it counts as failure.
    ++p;
    return p != end;
}
/**
 * Duplicates the character range [begin, end) into a fresh, NUL-terminated
 * array.
 *
 * @param begin First character to copy.
 * @param end   One past the last character to copy.
 * @return Array of (end - begin) + 1 wide characters allocated with new[];
 *         the caller is responsible for releasing it with delete[].
 */
wchar_t *copyOf(const wchar_t * const begin, const wchar_t * const end)
{
    const size_t count = end - begin;
    wchar_t * const duplicate = new wchar_t[count + 1];

    duplicate[count] = L'\0';
    std::memcpy(duplicate, begin, count * sizeof(wchar_t));

    return duplicate;
}
/**
 * One parsed dictionary record made of three NUL-terminated wide strings.
 *
 * The entry owns its three strings: they must have been allocated with
 * new[] and are released with delete[] in the destructor. Copying is
 * disabled because a copy would share (and later double-delete) the buffers.
 */
class DictionaryEntry
{
private:
    // Declared private and left undefined to forbid copying.
    DictionaryEntry(const DictionaryEntry&);
    DictionaryEntry &operator=(const DictionaryEntry&);
public:
    wchar_t *trad, *pinyin, *english;

    /**
     * Takes ownership of the three strings.
     *
     * @param trad    Headword text (new[]-allocated).
     * @param pinyin  Pinyin text (new[]-allocated).
     * @param english English text (new[]-allocated).
     */
    DictionaryEntry(wchar_t *trad, wchar_t *pinyin, wchar_t *english):
        trad(trad), pinyin(pinyin), english(english)
    {
    }

    ~DictionaryEntry()
    {
        delete[] trad;
        delete[] pinyin;
        delete[] english;
    }

    /**
     * Parses one line of the form
     *     <trad> ... [<pinyin>] ... /<english>/
     * where <trad> is everything before the first space, <pinyin> is the
     * text between the first '[' after it and the following ']', and
     * <english> runs from just after the first '/' that follows ']' up to
     * (not including) the last '/' on the line, so it may itself contain
     * '/' separators.
     *
     * All field boundaries are located before anything is allocated, so a
     * malformed line returns NULL without leaking partially built fields.
     *
     * @param line Start of the line (need not be NUL-terminated).
     * @param len  Number of characters in the line.
     * @return A new entry owned by the caller, or NULL if the line does not
     *         match the expected shape (including an empty english field).
     */
    static DictionaryEntry *parse(const wchar_t *line, size_t len)
    {
        const wchar_t * const end = line + len;

        // Headword: [line, first space).
        const wchar_t * const tradBegin = line;
        const wchar_t *tradEnd = line;
        if (!parseTo(tradEnd, L' ', end))
            return NULL;

        // Pinyin: text enclosed in the next [...] pair.
        const wchar_t *pinyinBegin = tradEnd;
        if (!parseToAndSkip(pinyinBegin, L'[', end))
            return NULL;
        const wchar_t *pinyinEnd = pinyinBegin;
        if (!parseTo(pinyinEnd, L']', end))
            return NULL;

        // English: starts after the first '/' following the pinyin...
        const wchar_t *englishBegin = pinyinEnd;
        if (!parseToAndSkip(englishBegin, L'/', end))
            return NULL;

        // ...and stops at the last '/' on the line. parseToAndSkip succeeded,
        // so englishBegin < end and end - 1 is a readable character.
        const wchar_t *englishEnd = end - 1;
        while (englishEnd != englishBegin && *englishEnd != L'/')
            --englishEnd;
        if (englishEnd == englishBegin)
            return NULL;

        // The line is valid; only now allocate. If a later allocation throws,
        // release whatever was already copied before propagating.
        wchar_t *tradCopy = copyOf(tradBegin, tradEnd);
        wchar_t *pinyinCopy = NULL;
        wchar_t *englishCopy = NULL;
        try
        {
            pinyinCopy = copyOf(pinyinBegin, pinyinEnd);
            englishCopy = copyOf(englishBegin, englishEnd);
            return new DictionaryEntry(tradCopy, pinyinCopy, englishCopy);
        }
        catch (...)
        {
            delete[] tradCopy;
            delete[] pinyinCopy;
            delete[] englishCopy;
            throw;
        }
    }
};
/**
 * In-memory collection of DictionaryEntry records loaded from a UTF-8 text
 * file, one record per line. Lines starting with '#' and lines that
 * DictionaryEntry::parse rejects are skipped.
 */
class Dictionary
{
private:
    boost::ptr_vector<DictionaryEntry> dict;

    /**
     * Appends one Unicode code point to a wide string. Where wchar_t is
     * 16 bits wide, code points above U+FFFF are written as a UTF-16
     * surrogate pair instead of being truncated (truncation could turn
     * e.g. U+2000A into a spurious '\n').
     */
    static void appendCodePoint(std::wstring &out, unsigned long cp)
    {
        if (sizeof(wchar_t) == 2 && cp > 0xFFFFUL)
        {
            const unsigned long offset = cp - 0x10000UL;
            out += static_cast<wchar_t>(0xD800UL + (offset >> 10));
            out += static_cast<wchar_t>(0xDC00UL + (offset & 0x3FFUL));
        }
        else
        {
            out += static_cast<wchar_t>(cp);
        }
    }

    /** Parses one complete line and stores the entry if it is valid. */
    void addLine(const std::wstring &line)
    {
        if (line.empty() || line[0] == L'#') // blank line or comment
            return;
        DictionaryEntry *de = DictionaryEntry::parse(line.c_str(), line.size());
        if (de)
            dict.push_back(de);
    }

    /**
     * Reads the whole file into memory, decodes it code point by code point
     * and feeds every line to addLine.
     *
     * @param path File to read, opened in binary mode.
     * @throws std::runtime_error if the file cannot be opened or its size
     *         cannot be determined.
     */
    void load(const char *path)
    {
        std::ifstream in(path, std::ios::binary);
        if (!in)
            throw std::runtime_error(std::string("Dictionary: cannot open ") + path);

        in.seekg(0, std::ios::end);
        const std::streamoff fileSize = in.tellg();
        if (fileSize < 0)
            throw std::runtime_error(std::string("Dictionary: cannot determine size of ") + path);
        in.seekg(0, std::ios::beg);

        // std::vector releases the buffer on every exit path, including
        // exceptions propagating out of the decode loop.
        std::vector<char> buf(static_cast<std::size_t>(fileSize));
        if (buf.empty())
            return;
        in.read(&buf[0], static_cast<std::streamsize>(buf.size()));

        const char *cur = &buf[0];
        // Only trust the bytes that were actually read.
        const char * const bufEnd = cur + in.gcount();

        std::wstring line; // grows as needed, so long lines cannot overflow
        while (cur < bufEnd)
        {
            // NOTE(review): utf8.h is presumably UTF8-CPP, whose next()
            // returns the decoded code point and advances cur - confirm.
            const unsigned long cp = utf8::next(cur, bufEnd);
            if (cp == '\n')
            {
                addLine(line);
                line.clear();
            }
            else if (cp != 13) // carriage return on windows
            {
                appendCodePoint(line, cp);
            }
        }
        // A final line without a trailing newline is still a complete line.
        addLine(line);
    }
public:
    /** Loads the default dictionary file "cedict_ts.u8". */
    Dictionary()
    {
        load("cedict_ts.u8");
    }

    /** Loads the dictionary from the given file path. */
    explicit Dictionary(const char *path)
    {
        load(path);
    }

    /** @return Number of entries that were successfully parsed and stored. */
    size_t length() const
    {
        return dict.size();
    }
};
int main(int argc, char *argv[])
{
LARGE_INTEGER startTime, endTime, freq;
QueryPerformanceFrequency(&freq);
QueryPerformanceCounter(&startTime);
Dictionary dict;
QueryPerformanceCounter(&endTime);
std::cout << "length: " << dict.length() << "\n";
std::cout << "frequency: " << freq.QuadPart << "\n";
std::cout << "time: " << (endTime.QuadPart - startTime.QuadPart) / (double)freq.QuadPart << "s\n";
}