This file is indexed.

/usr/include/tesseract/recodebeam.h is in libtesseract-dev 4.00~git2288-10f4998a-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
///////////////////////////////////////////////////////////////////////
// File:        recodebeam.h
// Description: Beam search to decode from the re-encoded CJK as a sequence of
//              smaller numbers in place of a single large code.
// Author:      Ray Smith
// Created:     Fri Mar 13 09:12:01 PDT 2015
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_
#define THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_

#include "dawg.h"
#include "dict.h"
#include "genericheap.h"
#include "kdpair.h"
#include "networkio.h"
#include "ratngs.h"
#include "unicharcompress.h"

namespace tesseract {

// Enum describing what can follow the current node.
// Consider the following softmax outputs:
// Timestep    0    1    2    3    4    5    6    7    8
// X-score    0.01 0.55 0.98 0.42 0.01 0.01 0.40 0.95 0.01
// Y-score    0.00 0.01 0.01 0.01 0.01 0.97 0.59 0.04 0.01
// Null-score 0.99 0.44 0.01 0.57 0.98 0.02 0.01 0.01 0.98
// Then the correct CTC decoding (in which adjacent equal classes are folded,
// and then all nulls are dropped) is clearly XYX, but simple decoding (taking
// the max at each timestep) leads to:
// Null@0.99 X@0.55 X@0.98 Null@0.57 Null@0.98 Y@0.97 Y@0.59 X@0.95 Null@0.98,
// which folds to the correct XYX. The conversion to Tesseract rating and
// certainty uses the sum of the log probs (log of the product of probabilities)
// for the Rating and the minimum log prob for the certainty, but that yields a
// minimum certainty of log(0.55), which is poor for such an obvious case.
// CTC says that the probability of the result is the SUM of the products of the
// probabilities over ALL PATHS that decode to the same result, which includes:
// NXXNNYYXN, NNXNNYYN, NXXXNYYXN, NNXXNYXXN, and others including XXXXXYYXX.
// That is intractable, so some compromise between simple and ideal is needed.
// Observing that evenly split timesteps rarely happen next to each other, we
// allow scores at a transition between classes to be added for decoding thus:
// N@0.99 (N+X)@0.99 X@0.98 (N+X)@0.99 N@0.98 Y@0.97 (X+Y+N)@1.00 X@0.95 N@0.98.
// This works because NNX and NXX both decode to X, so in the middle we can use
// N+X. Note that the classes either side of a sum must stand alone, i.e. use a
// single score, to force all paths to pass through them and decode to the same
// result. Also in the special case of a transition from X to Y, with only one
// timestep between, it is possible to add X+Y+N, since XXY, XYY, and XNY all
// decode to XY.
// An important condition is that we cannot combine X and Null between two
// stand-alone Xs, since that can decode as XNX->XX or XXX->X, so the scores for
// X and Null have to go in separate paths. Combining scores in this way
// provides a much better minimum certainty of log(0.95).
// In the implementation of the beam search, we have to place the possibilities
// X, X+N and X+Y+N in the beam under appropriate conditions of the previous
// node, and constrain what can follow, to enforce the rules explained above.
// We therefore have 3 different types of node determined by what can follow:
enum NodeContinuation {
  NC_ANYTHING,  // This node used just its own score, so anything can follow.
  NC_ONLY_DUP,  // The current node combined another score with the score for
                // itself, without a stand-alone duplicate before, so must be
                // followed by a stand-alone duplicate.
  NC_NO_DUP,    // The current node combined another score with the score for
                // itself, after a stand-alone, so can only be followed by
                // something other than a duplicate of the current node.
  NC_COUNT
};

// Enum describing the top-n status of a code.
enum TopNState {
  TN_TOP2,      // Winner or 2nd.
  TN_TOPN,      // Runner up in top-n, but not 1st or 2nd.
  TN_ALSO_RAN,  // Not in the top-n.
  TN_COUNT
};

// Lattice element for Re-encode beam search.
struct RecodeNode {
  RecodeNode()
      : code(-1),
        unichar_id(INVALID_UNICHAR_ID),
        permuter(TOP_CHOICE_PERM),
        start_of_dawg(false),
        start_of_word(false),
        end_of_word(false),
        duplicate(false),
        certainty(0.0f),
        score(0.0f),
        prev(nullptr),
        dawgs(nullptr),
        code_hash(0) {}
  RecodeNode(int c, int uni_id, PermuterType perm, bool dawg_start,
             bool word_start, bool end, bool dup, float cert, float s,
             const RecodeNode* p, DawgPositionVector* d, uint64_t hash)
      : code(c),
        unichar_id(uni_id),
        permuter(perm),
        start_of_dawg(dawg_start),
        start_of_word(word_start),
        end_of_word(end),
        duplicate(dup),
        certainty(cert),
        score(s),
        prev(p),
        dawgs(d),
        code_hash(hash) {}
  // NOTE: If we could use C++11, then this would be a move constructor.
  // Instead we have copy constructor that does a move!! This is because we
  // don't want to copy the whole DawgPositionVector each time, and true
  // copying isn't necessary for this struct. It does get moved around a lot
  // though inside the heap and during heap push, hence the move semantics.
  RecodeNode(RecodeNode& src) : dawgs(nullptr) {
    *this = src;
    ASSERT_HOST(src.dawgs == nullptr);
  }
  RecodeNode& operator=(RecodeNode& src) {
    delete dawgs;
    memcpy(this, &src, sizeof(src));
    src.dawgs = nullptr;
    return *this;
  }
  ~RecodeNode() { delete dawgs; }
  // Prints details of the node.
  void Print(int null_char, const UNICHARSET& unicharset, int depth) const;

  // The re-encoded code here = index to network output.
  int code;
  // The decoded unichar_id is only valid for the final code of a sequence.
  int unichar_id;
  // The type of permuter active at this point. Intervals between start_of_word
  // and end_of_word make valid words of type given by permuter where
  // end_of_word is true. These aren't necessarily delimited by spaces.
  PermuterType permuter;
  // True if this is the initial dawg state. May be attached to a space or,
  // in a non-space-delimited lang, the end of the previous word.
  bool start_of_dawg;
  // True if this is the first node in a dictionary word.
  bool start_of_word;
  // True if this represents a valid candidate end of word position. Does not
  // necessarily mark the end of a word, since a word can be extended beyond a
  // candidate end by a continuation, eg 'the' continues to 'these'.
  bool end_of_word;
  // True if this->code is a duplicate of prev->code. Some training modes
  // allow the network to output duplicate characters and crush them with CTC,
  // but that would mess up the dictionary search, so we just smash them
  // together on the fly using the duplicate flag.
  bool duplicate;
  // Certainty (log prob) of (just) this position.
  float certainty;
  // Total certainty of the path to this position.
  float score;
  // The previous node in this chain. Borrowed pointer.
  const RecodeNode* prev;
  // The currently active dawgs at this position. Owned pointer.
  DawgPositionVector* dawgs;
  // A hash of all codes in the prefix and this->code as well. Used for
  // duplicate path removal.
  uint64_t code_hash;
};

typedef KDPairInc<double, RecodeNode> RecodePair;
typedef GenericHeap<RecodePair> RecodeHeap;

// Class that holds the entire beam search for recognition of a text line.
class RecodeBeamSearch {
 public:
  // Borrows the pointer, which is expected to survive until *this is deleted.
  RecodeBeamSearch(const UnicharCompress& recoder, int null_char,
                   bool simple_text, Dict* dict);

  // Decodes the set of network outputs, storing the lattice internally.
  // If charset is not null, it enables detailed debugging of the beam search.
  void Decode(const NetworkIO& output, double dict_ratio, double cert_offset,
              double worst_dict_cert, const UNICHARSET* charset);
  void Decode(const GENERIC_2D_ARRAY<float>& output, double dict_ratio,
              double cert_offset, double worst_dict_cert,
              const UNICHARSET* charset);

  // Returns the best path as labels/scores/xcoords similar to simple CTC.
  void ExtractBestPathAsLabels(GenericVector<int>* labels,
                               GenericVector<int>* xcoords) const;
  // Returns the best path as unichar-ids/certs/ratings/xcoords skipping
  // duplicates, nulls and intermediate parts.
  void ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET* unicharset,
                                   GenericVector<int>* unichar_ids,
                                   GenericVector<float>* certs,
                                   GenericVector<float>* ratings,
                                   GenericVector<int>* xcoords) const;

  // Returns the best path as a set of WERD_RES.
  void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor,
                              bool debug, const UNICHARSET* unicharset,
                              PointerVector<WERD_RES>* words);

  // Generates debug output of the content of the beams after a Decode.
  void DebugBeams(const UNICHARSET& unicharset) const;

  // Clipping value for certainty inside Tesseract. Reflects the minimum value
  // of certainty that will be returned by ExtractBestPathAsUnicharIds.
  // Supposedly on a uniform scale that can be compared across languages and
  // engines.
  static const float kMinCertainty;
  // Number of different code lengths for which we have a separate beam.
  static const int kNumLengths = RecodedCharID::kMaxCodeLen + 1;
  // Total number of beams: dawg/nodawg * number of NodeContinuation * number
  // of different lengths.
  static const int kNumBeams = 2 * NC_COUNT * kNumLengths;
  // Returns the relevant factor in the beams_ index.
  static int LengthFromBeamsIndex(int index) { return index % kNumLengths; }
  static NodeContinuation ContinuationFromBeamsIndex(int index) {
    return static_cast<NodeContinuation>((index / kNumLengths) % NC_COUNT);
  }
  static bool IsDawgFromBeamsIndex(int index) {
    return index / (kNumLengths * NC_COUNT) > 0;
  }
  // Computes a beams_ index from the given factors.
  static int BeamIndex(bool is_dawg, NodeContinuation cont, int length) {
    return (is_dawg * NC_COUNT + cont) * kNumLengths + length;
  }

 private:
  // Struct for the Re-encode beam search. This struct holds the data for
  // a single time-step position of the output. Use a PointerVector<RecodeBeam>
  // to hold all the timesteps and prevent reallocation of the individual heaps.
  struct RecodeBeam {
    // Resets to the initial state without deleting all the memory.
    void Clear() {
      for (int i = 0; i < kNumBeams; ++i) {
        beams_[i].clear();
      }
      RecodeNode empty;
      for (int i = 0; i < NC_COUNT; ++i) {
        best_initial_dawgs_[i] = empty;
      }
    }

    // A separate beam for each combination of code length,
    // NodeContinuation, and dictionary flag. Separating out all these types
    // allows the beam to be quite narrow, and yet still have a low chance of
    // losing the best path.
    // We have to keep all these beams separate, since the highest scoring paths
    // come from the paths that are most likely to dead-end at any time, like
    // dawg paths, NC_ONLY_DUP etc.
    // Each heap is stored with the WORST result at the top, so we can quickly
    // get the top-n values.
    RecodeHeap beams_[kNumBeams];
    // While the language model is only a single word dictionary, we can use
    // word starts as a choke point in the beam, and keep only a single dict
    // start node at each step (for each NodeContinuation type), so we find the
    // best one here and push it on the heap, if it qualifies, after processing
    // all of the step.
    RecodeNode best_initial_dawgs_[NC_COUNT];
  };
  typedef KDPairInc<float, int> TopPair;

  // Generates debug output of the content of a single beam position.
  void DebugBeamPos(const UNICHARSET& unicharset, const RecodeHeap& heap) const;

  // Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping
  // duplicates, nulls and intermediate parts.
  static void ExtractPathAsUnicharIds(
      const GenericVector<const RecodeNode*>& best_nodes,
      GenericVector<int>* unichar_ids, GenericVector<float>* certs,
      GenericVector<float>* ratings, GenericVector<int>* xcoords);

  // Sets up a word with the ratings matrix and fake blobs with boxes in the
  // right places.
  WERD_RES* InitializeWord(bool leading_space, const TBOX& line_box,
                           int word_start, int word_end, float space_certainty,
                           const UNICHARSET* unicharset,
                           const GenericVector<int>& xcoords,
                           float scale_factor);

  // Fills top_n_flags_ with bools that are true iff the corresponding output
  // is one of the top_n.
  void ComputeTopN(const float* outputs, int num_outputs, int top_n);

  // Adds the computation for the current time-step to the beam. Call at each
  // time-step in sequence from left to right. outputs is the activation vector
  // for the current timestep.
  void DecodeStep(const float* outputs, int t, double dict_ratio,
                  double cert_offset, double worst_dict_cert,
                  const UNICHARSET* charset);

  // Adds to the appropriate beams the legal (according to recoder)
  // continuations of context prev, which is from the given index to beams_,
  // using the given network outputs to provide scores to the choices. Uses only
  // those choices for which top_n_flags[code] == top_n_flag.
  void ContinueContext(const RecodeNode* prev, int index, const float* outputs,
                       TopNState top_n_flag, double dict_ratio,
                       double cert_offset, double worst_dict_cert,
                       RecodeBeam* step);
  // Continues for a new unichar, using dawg or non-dawg as per flag.
  void ContinueUnichar(int code, int unichar_id, float cert,
                       float worst_dict_cert, float dict_ratio, bool use_dawgs,
                       NodeContinuation cont, const RecodeNode* prev,
                       RecodeBeam* step);
  // Adds a RecodeNode composed of the args to the correct heap in step if
  // unichar_id is a valid dictionary continuation of whatever is in prev.
  void ContinueDawg(int code, int unichar_id, float cert, NodeContinuation cont,
                    const RecodeNode* prev, RecodeBeam* step);
  // Sets the correct best_initial_dawgs_ with a RecodeNode composed of the args
  // if better than what is already there.
  void PushInitialDawgIfBetter(int code, int unichar_id, PermuterType permuter,
                               bool start, bool end, float cert,
                               NodeContinuation cont, const RecodeNode* prev,
                               RecodeBeam* step);
  // Adds a RecodeNode composed of the args to the correct heap in step for
  // partial unichar or duplicate if there is room or if better than the
  // current worst element if already full.
  void PushDupOrNoDawgIfBetter(int length, bool dup, int code, int unichar_id,
                               float cert, float worst_dict_cert,
                               float dict_ratio, bool use_dawgs,
                               NodeContinuation cont, const RecodeNode* prev,
                               RecodeBeam* step);
  // Adds a RecodeNode composed of the args to the correct heap in step if there
  // is room or if better than the current worst element if already full.
  void PushHeapIfBetter(int max_size, int code, int unichar_id,
                        PermuterType permuter, bool dawg_start, bool word_start,
                        bool end, bool dup, float cert, const RecodeNode* prev,
                        DawgPositionVector* d, RecodeHeap* heap);
  // Adds a RecodeNode to heap if there is room
  // or if better than the current worst element if already full.
  void PushHeapIfBetter(int max_size, RecodeNode* node, RecodeHeap* heap);
  // Searches the heap for an entry matching new_node, and updates the entry
  // with reshuffle if needed. Returns true if there was a match.
  bool UpdateHeapIfMatched(RecodeNode* new_node, RecodeHeap* heap);
  // Computes and returns the code-hash for the given code and prev.
  uint64_t ComputeCodeHash(int code, bool dup, const RecodeNode* prev) const;
  // Backtracks to extract the best path through the lattice that was built
  // during Decode. On return the best_nodes vector essentially contains the set
  // of code, score pairs that make the optimal path with the constraint that
  // the recoder can decode the code sequence back to a sequence of unichar-ids.
  void ExtractBestPaths(GenericVector<const RecodeNode*>* best_nodes,
                        GenericVector<const RecodeNode*>* second_nodes) const;
  // Helper backtracks through the lattice from the given node, storing the
  // path and reversing it.
  void ExtractPath(const RecodeNode* node,
                   GenericVector<const RecodeNode*>* path) const;
  // Helper prints debug information on the given lattice path.
  void DebugPath(const UNICHARSET* unicharset,
                 const GenericVector<const RecodeNode*>& path) const;
  // Helper prints debug information on the given unichar path.
  void DebugUnicharPath(const UNICHARSET* unicharset,
                        const GenericVector<const RecodeNode*>& path,
                        const GenericVector<int>& unichar_ids,
                        const GenericVector<float>& certs,
                        const GenericVector<float>& ratings,
                        const GenericVector<int>& xcoords) const;

  static const int kBeamWidths[RecodedCharID::kMaxCodeLen + 1];

  // The encoder/decoder that we will be using.
  const UnicharCompress& recoder_;
  // The beam for each timestep in the output.
  PointerVector<RecodeBeam> beam_;
  // The number of timesteps valid in beam_;
  int beam_size_;
  // A flag to indicate which outputs are the top-n choices. Current timestep
  // only.
  GenericVector<TopNState> top_n_flags_;
  // A record of the highest and second scoring codes.
  int top_code_;
  int second_code_;
  // Heap used to compute the top_n_flags_.
  GenericHeap<TopPair> top_heap_;
  // Borrowed pointer to the dictionary to use in the search.
  Dict* dict_;
  // True if the language is space-delimited, which is true for most languages
  // except chi*, jpn, tha.
  bool space_delimited_;
  // True if the input is simple text, ie adjacent equal chars are not to be
  // eliminated.
  bool is_simple_text_;
  // The encoded (class label) of the null/reject character.
  int null_char_;
};

}  // namespace tesseract.

#endif  // THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_