This file is indexed.

/usr/include/tesseract/tablerecog.h is in libtesseract-dev 3.02.01-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
///////////////////////////////////////////////////////////////////////
// File:        tablerecog.h
// Description: Functions to detect structure of tables.
// Author:    Nicholas Beato
// Created:   Aug 17, 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TABLERECOG_H_
#define TABLERECOG_H_

#include "colpartitiongrid.h"
#include "genericvector.h"

namespace tesseract {

// There are 2 classes in this file. They have 2 different purposes.
//  - StructuredTable contains the methods to find the structure given
//    a specific bounding box and grow that structure.
//  - TableRecognizer contains the methods to adjust the possible positions
//    of a table without worrying about structure.
//
// To use these classes, the assumption is that the TableFinder will
// have a guess of the location of a table (or possibly over/undersegmented
// tables). The TableRecognizer is responsible for finding the table boundaries
// at a high level. The StructuredTable class is responsible for determining
// the structure of the table and trying to maximize its bounds while retaining
// the structure.
// (The latter part is not implemented yet, but that was the goal).
//
// While on the boundary discussion, keep in mind that this is a first pass.
// There should eventually be some things like internal structure checks,
// and, more importantly, surrounding text flow checks.
//

// Usage:
// The StructuredTable class contains methods to query a potential table.
// It has functions to find structure, count rows, find ColPartitions that
// intersect gridlines, etc. It is not meant to blindly find a table. It
// is meant to start with a known table location and enhance it.
// Usage:
//    ColPartitionGrid text_grid, line_grid;  // init
//    TBOX table_box;  // known location of table location
//
//    StructuredTable table;
//    table.Init();  // construction code
//    table.set_text_grid(/* text */);  // These 2 grids can be the same!
//    table.set_line_grid(/* lines */);
//    table.set_min_text_height(10);    // Filter vertical and tall text.
//    // IMPORTANT! The table needs to be told where it is!
//    table.set_bounding_box(table_box);  // Set initial table location.
//    if (table.FindWhitespacedStructure()) {
//      // process table
//      table.column_count();  // number of columns
//      table.row_count();     // number of rows
//      table.cells_count();   // number of cells
//      table.bounding_box();  // updated bounding box
//      // etc.
//    }
//
class StructuredTable {
 public:
  StructuredTable();
  ~StructuredTable();

  // Initialization code. Must be called after the constructor.
  void Init();

  // Sets the grids used by the table. These can be changed between
  // calls to Recognize. They are treated as read-only data.
  void set_text_grid(ColPartitionGrid* text);
  void set_line_grid(ColPartitionGrid* lines);
  // Filters text partitions that are ridiculously tall to prevent
  // merging rows.
  void set_max_text_height(int height);

  // Basic accessors. Some are treated as attributes despite having indirect
  // representation.
  bool is_lined() const;
  int row_count() const;
  int column_count() const;
  int cell_count() const;
  void set_bounding_box(const TBOX& box);
  const TBOX& bounding_box() const;
  int median_cell_height();
  int median_cell_width();
  int row_height(int row) const;
  int column_width(int column) const;
  int space_above() const;
  int space_below() const;

  // Given enough horizontal and vertical lines in a region, create this table
  // based on the structure given by the lines. Return true if it worked out.
  // Code assumes the lines exist. It is the caller's responsibility to check
  // for lines and find an appropriate bounding box.
  bool FindLinedStructure();

  // The main subroutine for finding generic table structure. The function
  // finds the grid structure in the given box. Returns true if a good grid
  // exists, implying that "this" table is valid.
  bool FindWhitespacedStructure();

  ////////
  //////// Functions to query table info.
  ////////

  // Returns true if inserting part into the table does not cause any
  // cell merges.
  bool DoesPartitionFit(const ColPartition& part) const;
  // Checks if a sub-table has multiple data cells filled.
  int CountFilledCells();
  int CountFilledCellsInRow(int row);
  int CountFilledCellsInColumn(int column);
  int CountFilledCells(int row_start, int row_end,
                       int column_start, int column_end);

  // Makes sure that at least one cell in a row has substantial area filled.
  // This can filter out large whitespace caused by growing tables too far
  // and page numbers.
  // (currently bugged for some reason).
  bool VerifyRowFilled(int row);
  // Finds the filled area in a cell.
  double CalculateCellFilledPercentage(int row, int column);

  // Debug display, draws the table in the given color. If the table is not
  // valid, the table and "best" grid lines are still drawn in the given color.
  void Display(ScrollView* window, ScrollView::Color color);

 protected:
  // Clear the structure information.
  void ClearStructure();

  ////////
  //////// Lined tables
  ////////

  // Verifies the lines do not intersect partitions. This happens when
  // the lines are in column boundaries and extend the full page. As a result,
  // the grid lines go through column text. The condition is detectable.
  bool VerifyLinedTableCells();

  ////////
  //////// Tables with whitespace
  ////////

  // This is the function to change if you want to filter resulting tables
  // better. Right now it just checks for a minimum cell count and such.
  // You could add things like maximum number of ColPartitions per cell or
  // similar.
  bool VerifyWhitespacedTable();
  // Find the columns of a table using whitespace.
  void FindWhitespacedColumns();
  // Find the rows of a table using whitespace.
  void FindWhitespacedRows();

  ////////
  //////// Functions to provide information about the table.
  ////////

  // Calculates the whitespace around the table using the table boundary and
  // the supplied grids (set_text_grid and set_line_grid).
  void CalculateMargins();
  // Update the table margins with the supplied grid. This is
  // only called by calculate margins to use multiple grid sources.
  void UpdateMargins(ColPartitionGrid* grid);
  int FindVerticalMargin(ColPartitionGrid* grid, int start_x,
                         bool decrease) const;
  int FindHorizontalMargin(ColPartitionGrid* grid, int start_y,
                           bool decrease) const;
  // Calculates stats on the table, namely the median cell height and width.
  void CalculateStats();

  ////////
  //////// Functions to try to "fix" some table errors.
  ////////

  // Given a whitespaced table, this looks for bordering lines that might
  // be page layout boxes around the table. It is necessary to get the margins
  // correct on the table. If the lines are not joined, the margins will be
  // the distance to the line, which is not right.
  void AbsorbNearbyLines();

  // Nice utility function for finding partition gaps. You feed it a sorted
  // list of all of the mins/maxes of the partitions in the table, and it gives
  // you the gaps (middle). This works for both vertical and horizontal
  // gaps.
  //
  // If you want to allow slight overlap in the division and the partitions,
  // just scale down the partitions before inserting them in the list.
  // Likewise, you can force at least some space between partitions.
  // This trick is how the horizontal partitions are done (since the page
  // skew could make it hard to find splits in the text).
  //
  // As a result, "0 distance" between closest partitions causes a gap.
  // This is not a programmatic assumption. It is intentional and simplifies
  // things.
  //
  // "max_merged" indicates both the minimum number of stacked partitions
  // to cause a cell (add 1 to it), and the maximum number of partitions that
  // a grid line can intersect. For example, if max_merged is 0, then lines
  // are inserted wherever space exists between partitions. If it is 2,
  // lines may intersect 2 partitions at most, but you also need at least
  // 2 partitions to generate a line.
  static void FindCellSplitLocations(const GenericVector<int>& min_list,
                                     const GenericVector<int>& max_list,
                                     int max_merged,
                                     GenericVector<int>* locations);

  ////////
  //////// Utility function for table queries
  ////////

  // Counts the number of ColPartitions that intersect vertical cell
  // division at this x value. Used by VerifyLinedTable.
  int CountVerticalIntersections(int x);
  int CountHorizontalIntersections(int y);

  // Counts how many text partitions are in this box.
  int CountPartitions(const TBOX& box);

  ////////
  //////// Data members.
  ////////

  // Input data, used as read only data to make decisions.
  ColPartitionGrid* text_grid_;    // Text ColPartitions
  ColPartitionGrid* line_grid_;    // Line ColPartitions
  // Table structure.
  // bounding box is a convenient external representation.
  // cell_x_ and cell_y_ indicate the grid lines.
  TBOX bounding_box_;              // Bounding box
  GenericVectorEqEq<int> cell_x_;  // Locations of vertical divisions (sorted)
  GenericVectorEqEq<int> cell_y_;  // Locations of horizontal divisions (sorted)
  bool is_lined_;                  // Is the table backed up by a line structure
  // Table margins, set via CalculateMargins
  int space_above_;
  int space_below_;
  int space_left_;
  int space_right_;
  int median_cell_height_;
  int median_cell_width_;
  // Filters, used to prevent awkward partitions from destroying structure.
  int max_text_height_;
};

class TableRecognizer {
 public:
  TableRecognizer();
  ~TableRecognizer();

  // Initialization code. Must be called after the constructor.
  void Init();

  ////////
  //////// Pre-recognize methods to initial table constraints.
  ////////

  // Sets the grids used by the table. These can be changed between
  // calls to Recognize. They are treated as read-only data.
  void set_text_grid(ColPartitionGrid* text);
  void set_line_grid(ColPartitionGrid* lines);
  // Sets some additional constraints on the table.
  void set_min_height(int height);
  void set_min_width(int width);
  // Filters text partitions that are ridiculously tall to prevent
  // merging rows. Note that "filters" refers to allowing horizontal
  // cells to slice through them on the premise that they were
  // merged text rows during previous layout.
  void set_max_text_height(int height);

  // Given a guess location, the RecognizeTable function will try to find a
  // structured grid in the area. On success, it will return a new
  // StructuredTable (and assumes you will delete it). Otherwise,
  // NULL is returned.
  //
  // Keep in mind, this may "overgrow" or "undergrow" the size of guess.
  // Ideally, there is a either a one-to-one correspondence between
  // the guess and table or no table at all. This is not the best of
  // assumptions right now, but was made to try to keep things simple in
  // the first pass.
  //
  // If a line structure is available on the page in the given region,
  // the table will use the linear structure as it is.
  // Otherwise, it will try to maximize the whitespace around it while keeping
  // a grid structure. This is somewhat working.
  //
  // Since the combination of adjustments can get high, effort was
  // originally made to keep the number of adjustments linear in the number
  // of partitions. The underlying structure finding code used to be
  // much more complex. I don't know how necessary this constraint is anymore.
  // The evaluation of a possible table is kept within O(nlogn) in the size of
  // the table (where size is the number of partitions in the table).
  // As a result, the algorithm is capable of O(n^2 log n). Depending
  // on the grid search size, it may be higher.
  //
  // Last note: it is possible to just try all partition boundaries at a high
  // level O(n^4) and do a verification scheme (at least O(nlogn)). If there
  // area 200 partitions on a page, this could be too costly. Effort could go
  // into pruning the search, but I opted for something quicker. I'm confident
  // that the independent adjustments can get similar results and keep the
  // complextiy down. However, the other approach could work without using
  // TableFinder at all if it is fast enough.  It comes down to properly
  // deciding what is a table. The code currently relies on TableFinder's
  // guess to the location of a table for that.
  StructuredTable* RecognizeTable(const TBOX& guess_box);

 protected:
  ////////
  //////// Lined tables
  ////////

  // Returns true if the given box has a lined table within it. The
  // table argument will be updated with the table if the table exists.
  bool RecognizeLinedTable(const TBOX& guess_box, StructuredTable* table);
  // Returns true if the given box has a large number of horizontal and
  // vertical lines present. If so, we assume the extent of these lines
  // uniquely defines a table and find that table via SolveLinedTable.
  bool HasSignificantLines(const TBOX& guess);

  // Given enough horizontal and vertical lines in a region, find a bounding
  // box that encloses all of them (as well as newly introduced lines).
  // The bounding box is the smallest box that encloses the lines in guess
  // without having any lines sticking out of it.
  // bounding_box is an in/out parameter.
  // On input, it in the extents of the box to search.
  // On output, it is the resulting bounding box.
  bool FindLinesBoundingBox(TBOX* bounding_box);
  // Iteration in above search.
  // bounding_box is an in/out parameter.
  // On input, it in the extents of the box to search.
  // On output, it is the resulting bounding box.
  bool FindLinesBoundingBoxIteration(TBOX* bounding_box);

  ////////
  //////// Generic "whitespaced" tables
  ////////

  // Returns true if the given box has a whitespaced table within it. The
  // table argument will be updated if the table exists. Also note
  // that this method will fail if the guess_box center is not
  // mostly within the table.
  bool RecognizeWhitespacedTable(const TBOX& guess_box, StructuredTable* table);

  // Finds the location of a horizontal split relative to y.
  // This function is mostly unused now. If the SolveWhitespacedTable
  // changes much, it can be removed. Note, it isn't really as reliable
  // as I thought. I went with alternatives for most of the other uses.
  int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom);

  // Indicates that a table row is weak. This means that it has
  // many missing data cells or very large cell heights compared.
  // to the rest of the table.
  static bool IsWeakTableRow(StructuredTable* table, int row);

  // Input data, used as read only data to make decisions.
  ColPartitionGrid* text_grid_;    // Text ColPartitions
  ColPartitionGrid* line_grid_;    // Line ColPartitions
  // Table constraints, a "good" table must satisfy these.
  int min_height_;
  int min_width_;
  // Filters, used to prevent awkward partitions from destroying structure.
  int max_text_height_;  // Horizontal lines may intersect taller text.
};

}  // namespace tesseract

#endif  /* TABLERECOG_H_ */