本文整理汇总了C++中UNICHARSET::load_from_file方法的典型用法代码示例。如果您正苦于以下问题：C++ UNICHARSET::load_from_file方法的具体用法？C++ UNICHARSET::load_from_file怎么用？C++ UNICHARSET::load_from_file使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类UNICHARSET的用法示例。
在下文中一共展示了UNICHARSET::load_from_file方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: SetPropertiesForInputFile
// Helper to set the properties for an input unicharset file, writes to the
// output file. If an appropriate script unicharset can be found in the
// script_dir directory, then the tops and bottoms are expanded using the
// script unicharset.
// If non-empty, xheight data for the fonts are written to the xheights_file.
//
//   script_dir:             directory handed to SetScriptProperties and
//                           GetXheightString.
//   input_unicharset_file:  path of the unicharset to load.
//   output_unicharset_file: path the updated unicharset is saved to.
//   output_xheights_file:   path for the xheights string; skipped when empty.
//
// If the input unicharset cannot be loaded, an error is reported and no
// output is produced, so that a bad input path does not silently yield an
// output built from an unloaded unicharset.
void SetPropertiesForInputFile(const std::string& script_dir,
                               const std::string& input_unicharset_file,
                               const std::string& output_unicharset_file,
                               const std::string& output_xheights_file) {
  UNICHARSET unicharset;

  // Load the input unicharset; bail out rather than process a failed load.
  if (!unicharset.load_from_file(input_unicharset_file.c_str())) {
    tprintf("Failed to load unicharset from file %s\n",
            input_unicharset_file.c_str());
    return;
  }
  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
          input_unicharset_file.c_str());

  // Set unichar properties
  tprintf("Setting unichar properties\n");
  SetupBasicProperties(true, false, &unicharset);
  tprintf("Setting script properties\n");
  SetScriptProperties(script_dir, &unicharset);

  // The xheights file is optional: only written when a path was supplied.
  if (!output_xheights_file.empty()) {
    std::string xheights_str = GetXheightString(script_dir, unicharset);
    File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
  }

  // Write the output unicharset
  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
  if (!unicharset.save_to_file(output_unicharset_file.c_str())) {
    tprintf("Failed to write unicharset to file %s\n",
            output_unicharset_file.c_str());
  }
}
示例2: SetPropertiesForInputFile
// Helper to set the properties for an input unicharset file, writes to the
// output file. If an appropriate script unicharset can be found in the
// script_dir directory, then the tops and bottoms are expanded using the
// script unicharset.
// If non-empty, xheight data for the fonts are written to the xheights_file.
void SetPropertiesForInputFile(const string& script_dir,
const string& input_unicharset_file,
const string& output_unicharset_file,
const string& output_xheights_file) {
UNICHARSET unicharset;
// Load the input unicharset
unicharset.load_from_file(input_unicharset_file.c_str());
tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
input_unicharset_file.c_str());
// Set unichar properties
tprintf("Setting unichar properties\n");
SetupBasicProperties(true, false, &unicharset);
string xheights_str;
for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
// Load the unicharset for the script if available.
string filename = script_dir + "/" +
unicharset.get_script_from_script_id(s) + ".unicharset";
UNICHARSET script_set;
if (script_set.load_from_file(filename.c_str())) {
unicharset.SetPropertiesFromOther(script_set);
}
// Load the xheights for the script if available.
filename = script_dir + "/" + unicharset.get_script_from_script_id(s) +
".xheights";
string script_heights;
if (File::ReadFileToString(filename, &script_heights))
xheights_str += script_heights;
}
if (!output_xheights_file.empty())
File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset.size(); ++c) {
if (unicharset.PropertiesIncomplete(c)) {
tprintf("Warning: properties incomplete for index %d = %s\n",
c, unicharset.id_to_unichar(c));
}
}
// Write the output unicharset
tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
unicharset.save_to_file(output_unicharset_file.c_str());
}
示例3: main
int main(int argc, char** argv) {
// Sets properties on the input unicharset file, and writes:
// rootdir/lang/lang.charset_size=ddd.txt
// rootdir/lang/lang.traineddata
// rootdir/lang/lang.unicharset
// If the 3 word lists are provided, the dawgs are also added
// to the traineddata file.
// The output unicharset and charset_size files are just for
// human readability.
tesseract::CheckSharedLibraryVersion();
tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
GenericVector<STRING> words, puncs, numbers;
// If these reads fail, we get a warning message and an empty list of words.
tesseract::ReadFile(FLAGS_words.c_str(), nullptr).split('\n', &words);
tesseract::ReadFile(FLAGS_puncs.c_str(), nullptr).split('\n', &puncs);
tesseract::ReadFile(FLAGS_numbers.c_str(), nullptr).split('\n', &numbers);
// Load the input unicharset
UNICHARSET unicharset;
if (!unicharset.load_from_file(FLAGS_input_unicharset.c_str(), false)) {
tprintf("Failed to load unicharset from %s\n",
FLAGS_input_unicharset.c_str());
return 1;
}
tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
FLAGS_input_unicharset.c_str());
// Set unichar properties
tprintf("Setting unichar properties\n");
tesseract::SetupBasicProperties(/*report_errors*/ true,
/*decompose (NFD)*/ false, &unicharset);
tprintf("Setting script properties\n");
tesseract::SetScriptProperties(FLAGS_script_dir.c_str(), &unicharset);
// Combine everything into a traineddata file.
return tesseract::CombineLangModel(
unicharset, FLAGS_script_dir.c_str(), FLAGS_version_str.c_str(),
FLAGS_output_dir.c_str(), FLAGS_lang.c_str(), FLAGS_pass_through_recoder,
words, puncs, numbers, FLAGS_lang_is_rtl, /*reader*/ nullptr,
/*writer*/ nullptr);
}
示例4: main
int main(int argc, char *argv[]) {
if (argc != 4) {
tprintf("Print all the words in a given dawg.\n");
tprintf("Usage: %s <unicharset> <dawgfile> <wordlistfile>\n",
argv[0]);
return 1;
}
const char *unicharset_file = argv[1];
const char *dawg_file = argv[2];
const char *wordlist_file = argv[3];
UNICHARSET unicharset;
if (!unicharset.load_from_file(unicharset_file)) {
tprintf("Error loading unicharset from %s.\n", unicharset_file);
return 1;
}
tesseract::Dawg *dict = LoadSquishedDawg(unicharset, dawg_file);
if (dict == NULL) {
tprintf("Error loading dictionary from %s.\n", dawg_file);
return 1;
}
int retval = WriteDawgAsWordlist(unicharset, dict, wordlist_file);
delete dict;
return retval;
}
示例5: Main
//.........这里部分代码省略.........
// unichars.
render.set_vertical_text(true);
render.set_gravity_hint_strong(true);
render.set_render_fullwidth_latin(true);
} else {
tprintf("Invalid writing mode: %s\n", FLAGS_writing_mode.c_str());
exit(1);
}
std::string src_utf8;
// This c_str is NOT redundant!
if (!File::ReadFileToString(FLAGS_text.c_str(), &src_utf8)) {
tprintf("Failed to read file: %s\n", FLAGS_text.c_str());
exit(1);
}
// Remove the unicode mark if present.
if (strncmp(src_utf8.c_str(), "\xef\xbb\xbf", 3) == 0) {
src_utf8.erase(0, 3);
}
tlog(1, "Render string of size %d\n", src_utf8.length());
if (FLAGS_render_ngrams || FLAGS_only_extract_font_properties) {
// Try to preserve behavior of old text2image by expanding inter-word
// spaces by a factor of 4.
const std::string kSeparator = FLAGS_render_ngrams ? " " : " ";
// Also restrict the number of characters per line to try and avoid
// line-breaking in the middle of words like "-A", "R$" etc. which are
// otherwise allowed by the standard unicode line-breaking rules.
const unsigned int kCharsPerLine = (FLAGS_ptsize > 20) ? 50 : 100;
std::string rand_utf8;
UNICHARSET unicharset;
if (FLAGS_render_ngrams && !FLAGS_unicharset_file.empty() &&
!unicharset.load_from_file(FLAGS_unicharset_file.c_str())) {
tprintf("Failed to load unicharset from file %s\n",
FLAGS_unicharset_file.c_str());
exit(1);
}
// If we are rendering ngrams that will be OCRed later, shuffle them so that
// tesseract does not have difficulties finding correct baseline, word
// spaces, etc.
const char *str8 = src_utf8.c_str();
int len = src_utf8.length();
int step;
std::vector<std::pair<int, int> > offsets;
int offset = SpanUTF8Whitespace(str8);
while (offset < len) {
step = SpanUTF8NotWhitespace(str8 + offset);
offsets.push_back(std::make_pair(offset, step));
offset += step;
offset += SpanUTF8Whitespace(str8 + offset);
}
if (FLAGS_render_ngrams)
std::random_shuffle(offsets.begin(), offsets.end());
for (size_t i = 0, line = 1; i < offsets.size(); ++i) {
const char *curr_pos = str8 + offsets[i].first;
int ngram_len = offsets[i].second;
// Skip words that contain characters not found in unicharset.
std::string cleaned = UNICHARSET::CleanupString(curr_pos, ngram_len);
if (!FLAGS_unicharset_file.empty() &&
!unicharset.encodable_string(cleaned.c_str(), nullptr)) {
continue;
}
rand_utf8.append(curr_pos, ngram_len);
示例6: main
// Apart from command-line flags, input is a collection of lstmf files, that
// were previously created using tesseract with the lstm.train config file.
// The program iterates over the inputs, feeding the data to the network,
// until the error rate reaches a specified target or max_iterations is reached.
int main(int argc, char **argv) {
ParseArguments(&argc, &argv);
// Purify the model name in case it is based on the network string.
if (FLAGS_model_output.empty()) {
tprintf("Must provide a --model_output!\n");
return 1;
}
STRING model_output = FLAGS_model_output.c_str();
for (int i = 0; i < model_output.length(); ++i) {
if (model_output[i] == '[' || model_output[i] == ']')
model_output[i] = '-';
if (model_output[i] == '(' || model_output[i] == ')')
model_output[i] = '_';
}
// Setup the trainer.
STRING checkpoint_file = FLAGS_model_output.c_str();
checkpoint_file += "_checkpoint";
STRING checkpoint_bak = checkpoint_file + ".bak";
tesseract::LSTMTrainer trainer(
NULL, NULL, NULL, NULL, FLAGS_model_output.c_str(),
checkpoint_file.c_str(), FLAGS_debug_interval,
static_cast<inT64>(FLAGS_max_image_MB) * 1048576);
// Reading something from an existing model doesn't require many flags,
// so do it now and exit.
if (FLAGS_stop_training || FLAGS_debug_network) {
if (!trainer.TryLoadingCheckpoint(FLAGS_continue_from.c_str())) {
tprintf("Failed to read continue from: %s\n",
FLAGS_continue_from.c_str());
return 1;
}
if (FLAGS_debug_network) {
trainer.DebugNetwork();
} else {
if (FLAGS_train_mode & tesseract::TF_INT_MODE)
trainer.ConvertToInt();
GenericVector<char> recognizer_data;
trainer.SaveRecognitionDump(&recognizer_data);
if (!tesseract::SaveDataToFile(recognizer_data,
FLAGS_model_output.c_str())) {
tprintf("Failed to write recognition model : %s\n",
FLAGS_model_output.c_str());
}
}
return 0;
}
// Get the list of files to process.
if (FLAGS_train_listfile.empty()) {
tprintf("Must supply a list of training filenames! --train_listfile\n");
return 1;
}
GenericVector<STRING> filenames;
if (!tesseract::LoadFileLinesToStrings(FLAGS_train_listfile.c_str(),
&filenames)) {
tprintf("Failed to load list of training filenames from %s\n",
FLAGS_train_listfile.c_str());
return 1;
}
UNICHARSET unicharset;
// Checkpoints always take priority if they are available.
if (trainer.TryLoadingCheckpoint(checkpoint_file.string()) ||
trainer.TryLoadingCheckpoint(checkpoint_bak.string())) {
tprintf("Successfully restored trainer from %s\n",
checkpoint_file.string());
} else {
if (!FLAGS_continue_from.empty()) {
// Load a past model file to improve upon.
if (!trainer.TryLoadingCheckpoint(FLAGS_continue_from.c_str())) {
tprintf("Failed to continue from: %s\n", FLAGS_continue_from.c_str());
return 1;
}
tprintf("Continuing from %s\n", FLAGS_continue_from.c_str());
trainer.InitIterations();
}
if (FLAGS_continue_from.empty() || FLAGS_append_index >= 0) {
// We need a unicharset to start from scratch or append.
string unicharset_str;
// Character coding to be used by the classifier.
if (!unicharset.load_from_file(FLAGS_U.c_str())) {
tprintf("Error: must provide a -U unicharset!\n");
return 1;
}
tesseract::SetupBasicProperties(true, &unicharset);
if (FLAGS_append_index >= 0) {
tprintf("Appending a new network to an old one!!");
if (FLAGS_continue_from.empty()) {
tprintf("Must set --continue_from for appending!\n");
return 1;
}
}
// We are initializing from scratch.
trainer.InitCharSet(unicharset, FLAGS_script_dir.c_str(),
FLAGS_train_mode);
if (!trainer.InitNetwork(FLAGS_net_spec.c_str(), FLAGS_append_index,
//.........这里部分代码省略.........