/usr/include/tesseract/tablerecog.h is in libtesseract-dev 3.02.01-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 | ///////////////////////////////////////////////////////////////////////
// File: tablerecog.h
// Description: Functions to detect structure of tables.
// Author: Nicholas Beato
// Created: Aug 17, 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TABLERECOG_H_
#define TABLERECOG_H_
#include "colpartitiongrid.h"
#include "genericvector.h"
namespace tesseract {
// There are 2 classes in this file. They have 2 different purposes.
// - StructuredTable contains the methods to find the structure given
// a specific bounding box and grow that structure.
// - TableRecognizer contains the methods to adjust the possible positions
// of a table without worrying about structure.
//
// To use these classes, the assumption is that the TableFinder will
// have a guess of the location of a table (or possibly over/undersegmented
// tables). The TableRecognizer is responsible for finding the table boundaries
// at a high level. The StructuredTable class is responsible for determining
// the structure of the table and trying to maximize its bounds while retaining
// the structure.
// (The latter part is not implemented yet, but that was the goal).
//
// While on the boundary discussion, keep in mind that this is a first pass.
// There should eventually be some things like internal structure checks,
// and, more importantly, surrounding text flow checks.
//
// Usage:
// The StructuredTable class contains methods to query a potential table.
// It has functions to find structure, count rows, find ColPartitions that
// intersect gridlines, etc. It is not meant to blindly find a table. It
// is meant to start with a known table location and enhance it.
// Usage:
// ColPartitionGrid text_grid, line_grid; // init
// TBOX table_box; // known location of table location
//
// StructuredTable table;
// table.Init(); // construction code
// table.set_text_grid(/* text */); // These 2 grids can be the same!
// table.set_line_grid(/* lines */);
// table.set_min_text_height(10); // Filter vertical and tall text.
// // IMPORTANT! The table needs to be told where it is!
// table.set_bounding_box(table_box); // Set initial table location.
// if (table.FindWhitespacedStructure()) {
// // process table
// table.column_count(); // number of columns
// table.row_count(); // number of rows
// table.cells_count(); // number of cells
// table.bounding_box(); // updated bounding box
// // etc.
// }
//
class StructuredTable {
public:
StructuredTable();
~StructuredTable();
// Initialization code. Must be called after the constructor.
void Init();
// Sets the grids used by the table. These can be changed between
// calls to Recognize. They are treated as read-only data.
void set_text_grid(ColPartitionGrid* text);
void set_line_grid(ColPartitionGrid* lines);
// Filters text partitions that are ridiculously tall to prevent
// merging rows.
void set_max_text_height(int height);
// Basic accessors. Some are treated as attributes despite having indirect
// representation.
bool is_lined() const;
int row_count() const;
int column_count() const;
int cell_count() const;
void set_bounding_box(const TBOX& box);
const TBOX& bounding_box() const;
int median_cell_height();
int median_cell_width();
int row_height(int row) const;
int column_width(int column) const;
int space_above() const;
int space_below() const;
// Given enough horizontal and vertical lines in a region, create this table
// based on the structure given by the lines. Return true if it worked out.
// Code assumes the lines exist. It is the caller's responsibility to check
// for lines and find an appropriate bounding box.
bool FindLinedStructure();
// The main subroutine for finding generic table structure. The function
// finds the grid structure in the given box. Returns true if a good grid
// exists, implying that "this" table is valid.
bool FindWhitespacedStructure();
////////
//////// Functions to query table info.
////////
// Returns true if inserting part into the table does not cause any
// cell merges.
bool DoesPartitionFit(const ColPartition& part) const;
// Checks if a sub-table has multiple data cells filled.
int CountFilledCells();
int CountFilledCellsInRow(int row);
int CountFilledCellsInColumn(int column);
int CountFilledCells(int row_start, int row_end,
int column_start, int column_end);
// Makes sure that at least one cell in a row has substantial area filled.
// This can filter out large whitespace caused by growing tables too far
// and page numbers.
// (currently bugged for some reason).
bool VerifyRowFilled(int row);
// Finds the filled area in a cell.
double CalculateCellFilledPercentage(int row, int column);
// Debug display, draws the table in the given color. If the table is not
// valid, the table and "best" grid lines are still drawn in the given color.
void Display(ScrollView* window, ScrollView::Color color);
protected:
// Clear the structure information.
void ClearStructure();
////////
//////// Lined tables
////////
// Verifies the lines do not intersect partitions. This happens when
// the lines are in column boundaries and extend the full page. As a result,
// the grid lines go through column text. The condition is detectable.
bool VerifyLinedTableCells();
////////
//////// Tables with whitespace
////////
// This is the function to change if you want to filter resulting tables
// better. Right now it just checks for a minimum cell count and such.
// You could add things like maximum number of ColPartitions per cell or
// similar.
bool VerifyWhitespacedTable();
// Find the columns of a table using whitespace.
void FindWhitespacedColumns();
// Find the rows of a table using whitespace.
void FindWhitespacedRows();
////////
//////// Functions to provide information about the table.
////////
// Calculates the whitespace around the table using the table boundary and
// the supplied grids (set_text_grid and set_line_grid).
void CalculateMargins();
// Update the table margins with the supplied grid. This is
// only called by calculate margins to use multiple grid sources.
void UpdateMargins(ColPartitionGrid* grid);
int FindVerticalMargin(ColPartitionGrid* grid, int start_x,
bool decrease) const;
int FindHorizontalMargin(ColPartitionGrid* grid, int start_y,
bool decrease) const;
// Calculates stats on the table, namely the median cell height and width.
void CalculateStats();
////////
//////// Functions to try to "fix" some table errors.
////////
// Given a whitespaced table, this looks for bordering lines that might
// be page layout boxes around the table. It is necessary to get the margins
// correct on the table. If the lines are not joined, the margins will be
// the distance to the line, which is not right.
void AbsorbNearbyLines();
// Nice utility function for finding partition gaps. You feed it a sorted
// list of all of the mins/maxes of the partitions in the table, and it gives
// you the gaps (middle). This works for both vertical and horizontal
// gaps.
//
// If you want to allow slight overlap in the division and the partitions,
// just scale down the partitions before inserting them in the list.
// Likewise, you can force at least some space between partitions.
// This trick is how the horizontal partitions are done (since the page
// skew could make it hard to find splits in the text).
//
// As a result, "0 distance" between closest partitions causes a gap.
// This is not a programmatic assumption. It is intentional and simplifies
// things.
//
// "max_merged" indicates both the minimum number of stacked partitions
// to cause a cell (add 1 to it), and the maximum number of partitions that
// a grid line can intersect. For example, if max_merged is 0, then lines
// are inserted wherever space exists between partitions. If it is 2,
// lines may intersect 2 partitions at most, but you also need at least
// 2 partitions to generate a line.
static void FindCellSplitLocations(const GenericVector<int>& min_list,
const GenericVector<int>& max_list,
int max_merged,
GenericVector<int>* locations);
////////
//////// Utility function for table queries
////////
// Counts the number of ColPartitions that intersect vertical cell
// division at this x value. Used by VerifyLinedTable.
int CountVerticalIntersections(int x);
int CountHorizontalIntersections(int y);
// Counts how many text partitions are in this box.
int CountPartitions(const TBOX& box);
////////
//////// Data members.
////////
// Input data, used as read only data to make decisions.
ColPartitionGrid* text_grid_; // Text ColPartitions
ColPartitionGrid* line_grid_; // Line ColPartitions
// Table structure.
// bounding box is a convenient external representation.
// cell_x_ and cell_y_ indicate the grid lines.
TBOX bounding_box_; // Bounding box
GenericVectorEqEq<int> cell_x_; // Locations of vertical divisions (sorted)
GenericVectorEqEq<int> cell_y_; // Locations of horizontal divisions (sorted)
bool is_lined_; // Is the table backed up by a line structure
// Table margins, set via CalculateMargins
int space_above_;
int space_below_;
int space_left_;
int space_right_;
int median_cell_height_;
int median_cell_width_;
// Filters, used to prevent awkward partitions from destroying structure.
int max_text_height_;
};
class TableRecognizer {
public:
TableRecognizer();
~TableRecognizer();
// Initialization code. Must be called after the constructor.
void Init();
////////
//////// Pre-recognize methods to initial table constraints.
////////
// Sets the grids used by the table. These can be changed between
// calls to Recognize. They are treated as read-only data.
void set_text_grid(ColPartitionGrid* text);
void set_line_grid(ColPartitionGrid* lines);
// Sets some additional constraints on the table.
void set_min_height(int height);
void set_min_width(int width);
// Filters text partitions that are ridiculously tall to prevent
// merging rows. Note that "filters" refers to allowing horizontal
// cells to slice through them on the premise that they were
// merged text rows during previous layout.
void set_max_text_height(int height);
// Given a guess location, the RecognizeTable function will try to find a
// structured grid in the area. On success, it will return a new
// StructuredTable (and assumes you will delete it). Otherwise,
// NULL is returned.
//
// Keep in mind, this may "overgrow" or "undergrow" the size of guess.
// Ideally, there is a either a one-to-one correspondence between
// the guess and table or no table at all. This is not the best of
// assumptions right now, but was made to try to keep things simple in
// the first pass.
//
// If a line structure is available on the page in the given region,
// the table will use the linear structure as it is.
// Otherwise, it will try to maximize the whitespace around it while keeping
// a grid structure. This is somewhat working.
//
// Since the combination of adjustments can get high, effort was
// originally made to keep the number of adjustments linear in the number
// of partitions. The underlying structure finding code used to be
// much more complex. I don't know how necessary this constraint is anymore.
// The evaluation of a possible table is kept within O(nlogn) in the size of
// the table (where size is the number of partitions in the table).
// As a result, the algorithm is capable of O(n^2 log n). Depending
// on the grid search size, it may be higher.
//
// Last note: it is possible to just try all partition boundaries at a high
// level O(n^4) and do a verification scheme (at least O(nlogn)). If there
// area 200 partitions on a page, this could be too costly. Effort could go
// into pruning the search, but I opted for something quicker. I'm confident
// that the independent adjustments can get similar results and keep the
// complextiy down. However, the other approach could work without using
// TableFinder at all if it is fast enough. It comes down to properly
// deciding what is a table. The code currently relies on TableFinder's
// guess to the location of a table for that.
StructuredTable* RecognizeTable(const TBOX& guess_box);
protected:
////////
//////// Lined tables
////////
// Returns true if the given box has a lined table within it. The
// table argument will be updated with the table if the table exists.
bool RecognizeLinedTable(const TBOX& guess_box, StructuredTable* table);
// Returns true if the given box has a large number of horizontal and
// vertical lines present. If so, we assume the extent of these lines
// uniquely defines a table and find that table via SolveLinedTable.
bool HasSignificantLines(const TBOX& guess);
// Given enough horizontal and vertical lines in a region, find a bounding
// box that encloses all of them (as well as newly introduced lines).
// The bounding box is the smallest box that encloses the lines in guess
// without having any lines sticking out of it.
// bounding_box is an in/out parameter.
// On input, it in the extents of the box to search.
// On output, it is the resulting bounding box.
bool FindLinesBoundingBox(TBOX* bounding_box);
// Iteration in above search.
// bounding_box is an in/out parameter.
// On input, it in the extents of the box to search.
// On output, it is the resulting bounding box.
bool FindLinesBoundingBoxIteration(TBOX* bounding_box);
////////
//////// Generic "whitespaced" tables
////////
// Returns true if the given box has a whitespaced table within it. The
// table argument will be updated if the table exists. Also note
// that this method will fail if the guess_box center is not
// mostly within the table.
bool RecognizeWhitespacedTable(const TBOX& guess_box, StructuredTable* table);
// Finds the location of a horizontal split relative to y.
// This function is mostly unused now. If the SolveWhitespacedTable
// changes much, it can be removed. Note, it isn't really as reliable
// as I thought. I went with alternatives for most of the other uses.
int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom);
// Indicates that a table row is weak. This means that it has
// many missing data cells or very large cell heights compared.
// to the rest of the table.
static bool IsWeakTableRow(StructuredTable* table, int row);
// Input data, used as read only data to make decisions.
ColPartitionGrid* text_grid_; // Text ColPartitions
ColPartitionGrid* line_grid_; // Line ColPartitions
// Table constraints, a "good" table must satisfy these.
int min_height_;
int min_width_;
// Filters, used to prevent awkward partitions from destroying structure.
int max_text_height_; // Horizontal lines may intersect taller text.
};
} // namespace tesseract
#endif /* TABLERECOG_H_ */
|