New addon "tesseract" for OCR

This commit is contained in:
Michael 2024-01-14 19:21:08 +00:00 committed by Hypolite Petovan
parent a179bab747
commit 4dd903b473
28 changed files with 1904 additions and 0 deletions

View file

@ -0,0 +1,14 @@
---
build: false
install:
- ps: Set-Service wuauserv -StartupType Manual
- choco install php
- choco install capture2text --version 3.9
- choco install composer
- refreshenv
- cd %APPVEYOR_BUILD_FOLDER%
- composer install
test_script:
- php tests\run.php unit e2e

View file

@ -0,0 +1,19 @@
Copyright (c) 2012-2021 Thiago Alessio Pereira
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,508 @@
<img src="https://thiagoalessio.github.io/tesseract-ocr-for-php/images/logo.png" alt="Tesseract OCR for PHP" align="right" width="320px"/>
# Tesseract OCR for PHP
A wrapper to work with Tesseract OCR inside PHP.
[![CI][ci_badge]][ci]
[![AppVeyor][appveyor_badge]][appveyor]
[![Codacy][codacy_badge]][codacy]
[![Test Coverage][test_coverage_badge]][test_coverage]
<br/>
[![Latest Stable Version][stable_version_badge]][packagist]
[![Total Downloads][total_downloads_badge]][packagist]
[![Monthly Downloads][monthly_downloads_badge]][packagist]
## Installation
Via [Composer][]:
$ composer require thiagoalessio/tesseract_ocr
:bangbang: **This library depends on [Tesseract OCR][], version _3.02_ or later.**
<br/>
### ![][windows_icon] Note for Windows users
There are [many ways][tesseract_installation_on_windows] to install
[Tesseract OCR][] on your system, but if you just want something quick to
get up and running, I recommend installing the [Capture2Text][] package with
[Chocolatey][].
choco install capture2text --version 3.9
:warning: Recent versions of [Capture2Text][] stopped shipping the `tesseract` binary.
<br/>
### ![][macos_icon] Note for macOS users
With [MacPorts][] you can install support for individual languages, like so:
$ sudo port install tesseract-<langcode>
But that is not possible with [Homebrew][]. It comes only with **English** support
by default, so if you intend to use it for other languages, the quickest solution
is to install them all:
$ brew install tesseract tesseract-lang
<br/>
## Usage
### Basic usage
<img align="right" width="50%" title="The quick brown fox jumps over the lazy dog." src="./tests/EndToEnd/images/text.png"/>
```php
use thiagoalessio\TesseractOCR\TesseractOCR;
echo (new TesseractOCR('text.png'))
->run();
```
```
The quick brown fox
jumps over
the lazy dog.
```
<br/>
### Other languages
<img align="right" width="50%" title="Bülowstraße" src="./tests/EndToEnd/images/german.png"/>
```php
use thiagoalessio\TesseractOCR\TesseractOCR;
echo (new TesseractOCR('german.png'))
->lang('deu')
->run();
```
```
Bülowstraße
```
<br/>
### Multiple languages
<img align="right" width="50%" title="I eat すし y Pollo" src="./tests/EndToEnd/images/mixed-languages.png"/>
```php
use thiagoalessio\TesseractOCR\TesseractOCR;
echo (new TesseractOCR('mixed-languages.png'))
->lang('eng', 'jpn', 'spa')
->run();
```
```
I eat すし y Pollo
```
<br/>
### Inducing recognition
<img align="right" width="50%" title="8055" src="./tests/EndToEnd/images/8055.png"/>
```php
use thiagoalessio\TesseractOCR\TesseractOCR;
echo (new TesseractOCR('8055.png'))
->allowlist(range('A', 'Z'))
->run();
```
```
BOSS
```
<br/>
### Breaking CAPTCHAs
Yes, I know some of you might want to use this library for the *noble* purpose
of breaking CAPTCHAs, so please take a look at this comment:
<https://github.com/thiagoalessio/tesseract-ocr-for-php/issues/91#issuecomment-342290510>
## API
### run
Executes a `tesseract` command, optionally receiving a `timeout` (in seconds),
in case you experience stalled tesseract processes.
```php
$ocr = new TesseractOCR();
$ocr->run();
```
```php
$ocr = new TesseractOCR();
$timeout = 500;
$ocr->run($timeout);
```
### image
Define the path of an image to be recognized by `tesseract`.
```php
$ocr = new TesseractOCR();
$ocr->image('/path/to/image.png');
$ocr->run();
```
### imageData
Set the image to be recognized by `tesseract` from a string, with its size.
This can be useful when dealing with files that are already loaded in memory.
You can easily retrieve the image data and size of an image object:
```php
//Using Imagick
$data = $img->getImageBlob();
$size = $img->getImageLength();
//Using GD
ob_start();
// Note that you can use any format supported by tesseract
imagepng($img, null, 0);
$size = ob_get_length();
$data = ob_get_clean();
$ocr = new TesseractOCR();
$ocr->imageData($data, $size);
$ocr->run();
```
### executable
Define a custom location of the `tesseract` executable,
if for any reason it is not present in the `$PATH`.
```php
echo (new TesseractOCR('img.png'))
->executable('/path/to/tesseract')
->run();
```
### version
Returns the current version of `tesseract`.
```php
echo (new TesseractOCR())->version();
```
### availableLanguages
Returns a list of available languages/scripts.
```php
foreach((new TesseractOCR())->availableLanguages() as $lang) echo $lang;
```
__More info:__ <https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages-and-scripts>
### tessdataDir
Specify a custom location for the tessdata directory.
```php
echo (new TesseractOCR('img.png'))
->tessdataDir('/path')
->run();
```
### userWords
Specify the location of user words file.
This is a plain text file containing a list of words that you want to be
considered as normal dictionary words by `tesseract`.
Useful when dealing with contents that contain technical terminology, jargon,
etc.
```
$ cat /path/to/user-words.txt
foo
bar
```
```php
echo (new TesseractOCR('img.png'))
->userWords('/path/to/user-words.txt')
->run();
```
### userPatterns
Specify the location of user patterns file.
If the contents you are dealing with have known patterns, this option can
greatly improve tesseract's recognition accuracy.
```
$ cat /path/to/user-patterns.txt
1-\d\d\d-GOOG-441
www.\n\\\*.com
```
```php
echo (new TesseractOCR('img.png'))
->userPatterns('/path/to/user-patterns.txt')
->run();
```
### lang
Define one or more languages to be used during the recognition.
A complete list of available languages can be found at:
<https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages>
__Tip from [@daijiale][]:__ Use the combination `->lang('chi_sim', 'chi_tra')`
for proper recognition of Chinese.
```php
echo (new TesseractOCR('img.png'))
->lang('lang1', 'lang2', 'lang3')
->run();
```
### psm
Specify the Page Segmentation Method, which instructs `tesseract` how to
interpret the given image.
__More info:__ <https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality#page-segmentation-method>
```php
echo (new TesseractOCR('img.png'))
->psm(6)
->run();
```
### oem
Specify the OCR Engine Mode. (see `tesseract --help-oem`)
```php
echo (new TesseractOCR('img.png'))
->oem(2)
->run();
```
### dpi
Specify the image DPI. It is useful if your image does not contain this information in its metadata.
```php
echo (new TesseractOCR('img.png'))
->dpi(300)
->run();
```
### allowlist
This is a shortcut for `->config('tessedit_char_whitelist', 'abcdef....')`.
```php
echo (new TesseractOCR('img.png'))
->allowlist(range('a', 'z'), range(0, 9), '-_@')
->run();
```
### configFile
Specify a config file to be used. It can either be the path to your own
config file or the name of one of the predefined config files:
<https://github.com/tesseract-ocr/tesseract/tree/master/tessdata/configs>
```php
echo (new TesseractOCR('img.png'))
->configFile('hocr')
->run();
```
### setOutputFile
Specify an output file to be used. Be aware: if you set an output file, then
the option `withoutTempFiles` is ignored.
Tempfiles are written (and deleted) even if `withoutTempFiles = true`.
In combination with `configFile` you are able to get the `hocr`, `tsv` or
`pdf` files.
```php
echo (new TesseractOCR('img.png'))
->configFile('pdf')
->setOutputFile('/PATH_TO_MY_OUTPUTFILE/searchable.pdf')
->run();
```
### digits
Shortcut for `->configFile('digits')`.
```php
echo (new TesseractOCR('img.png'))
->digits()
->run();
```
### hocr
Shortcut for `->configFile('hocr')`.
```php
echo (new TesseractOCR('img.png'))
->hocr()
->run();
```
### pdf
Shortcut for `->configFile('pdf')`.
```php
echo (new TesseractOCR('img.png'))
->pdf()
->run();
```
### quiet
Shortcut for `->configFile('quiet')`.
```php
echo (new TesseractOCR('img.png'))
->quiet()
->run();
```
### tsv
Shortcut for `->configFile('tsv')`.
```php
echo (new TesseractOCR('img.png'))
->tsv()
->run();
```
### txt
Shortcut for `->configFile('txt')`.
```php
echo (new TesseractOCR('img.png'))
->txt()
->run();
```
### tempDir
Define a custom directory to store temporary files generated by tesseract.
Make sure the directory actually exists and the user running `php` is allowed
to write in there.
```php
echo (new TesseractOCR('img.png'))
->tempDir('./my/custom/temp/dir')
->run();
```
### withoutTempFiles
Specify that `tesseract` should output the recognized text without writing to temporary files.
The data is gathered from the standard output of `tesseract` instead.
```php
echo (new TesseractOCR('img.png'))
->withoutTempFiles()
->run();
```
### Other options
Any configuration option offered by Tesseract can be used like that:
```php
echo (new TesseractOCR('img.png'))
->config('config_var', 'value')
->config('other_config_var', 'other value')
->run();
```
Or like that:
```php
echo (new TesseractOCR('img.png'))
->configVar('value')
->otherConfigVar('other value')
->run();
```
__More info:__ <https://github.com/tesseract-ocr/tesseract/wiki/ControlParams>
### Thread-limit
Sometimes, it may be useful to limit the number of threads that tesseract is
allowed to use (e.g. in [this case](https://github.com/tesseract-ocr/tesseract/issues/898)).
Set the maximum number of threads with the `threadLimit` method:
```php
echo (new TesseractOCR('img.png'))
->threadLimit(1)
->run();
```
## How to contribute
You can contribute to this project by:
* Opening an [Issue][] if you found a bug or wish to propose a new feature;
* Placing a [Pull Request][] with code that fixes a bug, fixes missing/wrong documentation
or implements a new feature;
Just make sure you take a look at our [Code of Conduct][] and [Contributing][]
instructions.
## License
tesseract-ocr-for-php is released under the [MIT License][].
<h2></h2><p align="center"><sub>Made with <sub><a href="#"><img src="https://thiagoalessio.github.io/tesseract-ocr-for-php/images/heart.svg" alt="love" width="14px"/></a></sub> in Berlin</sub></p>
[ci_badge]: https://github.com/thiagoalessio/tesseract-ocr-for-php/workflows/CI/badge.svg?event=push&branch=main
[ci]: https://github.com/thiagoalessio/tesseract-ocr-for-php/actions?query=workflow%3ACI
[appveyor_badge]: https://ci.appveyor.com/api/projects/status/xwy5ls0798iwcim3/branch/main?svg=true
[appveyor]: https://ci.appveyor.com/project/thiagoalessio/tesseract-ocr-for-php/branch/main
[codacy_badge]: https://app.codacy.com/project/badge/Grade/a81aa10012874f23a57df5b492d835f2
[codacy]: https://www.codacy.com/gh/thiagoalessio/tesseract-ocr-for-php/dashboard
[test_coverage_badge]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php/branch/main/graph/badge.svg?token=Y0VnrqiSIf
[test_coverage]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php
[stable_version_badge]: https://img.shields.io/packagist/v/thiagoalessio/tesseract_ocr.svg
[packagist]: https://packagist.org/packages/thiagoalessio/tesseract_ocr
[total_downloads_badge]: https://img.shields.io/packagist/dt/thiagoalessio/tesseract_ocr.svg
[monthly_downloads_badge]: https://img.shields.io/packagist/dm/thiagoalessio/tesseract_ocr.svg
[Tesseract OCR]: https://github.com/tesseract-ocr/tesseract
[Composer]: http://getcomposer.org/
[windows_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/windows-18.svg
[macos_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/apple-18.svg
[tesseract_installation_on_windows]: https://github.com/tesseract-ocr/tesseract/wiki#windows
[Capture2Text]: https://chocolatey.org/packages/capture2text
[Chocolatey]: https://chocolatey.org
[MacPorts]: https://www.macports.org
[Homebrew]: https://brew.sh
[@daijiale]: https://github.com/daijiale
[HOCR]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#hocr-output
[TSV]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#tsv-output-currently-available-in-305-dev-in-master-branch-on-github
[Issue]: https://github.com/thiagoalessio/tesseract-ocr-for-php/issues
[Pull Request]: https://github.com/thiagoalessio/tesseract-ocr-for-php/pulls
[Code of Conduct]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CODE_OF_CONDUCT.md
[Contributing]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CONTRIBUTING.md
[MIT License]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/MIT-LICENSE

View file

@ -0,0 +1,4 @@
fixes:
- "/home/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::"
- "/Users/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::"
- "C:\\projects\\tesseract-ocr-for-php\\::"

View file

@ -0,0 +1,35 @@
{
"name": "thiagoalessio/tesseract_ocr",
"description": "A wrapper to work with Tesseract OCR inside PHP.",
"version": "2.13.0",
"type": "library",
"keywords": ["Tesseract", "OCR", "text recognition"],
"license": "MIT",
"authors": [
{
"name": "thiagoalessio",
"email": "thiagoalessio@me.com"
}
],
"support": {
"issues": "https://github.com/thiagoalessio/tesseract-ocr-for-php/issues",
"irc": "irc://irc.freenode.net/tesseract-ocr-for-php",
"source": "https://github.com/thiagoalessio/tesseract-ocr-for-php"
},
"require": {
"php": "^5.3 || ^7.0 || ^8.0"
},
"require-dev": {
"phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
},
"autoload": {
"psr-4": {
"thiagoalessio\\TesseractOCR\\": "src/"
}
},
"autoload-dev": {
"psr-4": {
"thiagoalessio\\TesseractOCR\\Tests\\": "tests/"
}
}
}

View file

@ -0,0 +1,80 @@
<?php namespace thiagoalessio\TesseractOCR;
/**
 * Assembles the command line used to run the `tesseract` executable.
 *
 * Casting an instance to string (or calling build()) yields the complete
 * command: an optional OMP_THREAD_LIMIT prefix, the executable, the input,
 * the output, every option and finally the config file, joined by spaces.
 */
class Command
{
    /** @var string Name or path of the tesseract executable. */
    public $executable = 'tesseract';

    /** @var bool When true the image path is the input argument, otherwise "-" is used. */
    public $useFileAsInput = true;

    /** @var bool When true an output base name is the output argument, otherwise "-" is used. */
    public $useFileAsOutput = true;

    /** @var array Option strings, or callables that receive the tesseract version and return a string. */
    public $options = array();

    /** @var string|null Config file name appended (unescaped) at the end of the command. */
    public $configFile;

    /** @var string|null Custom temporary directory; the system one is used when empty. */
    public $tempDir;

    /** @var mixed Value exported as OMP_THREAD_LIMIT when truthy. */
    public $threadLimit;

    /** @var mixed Image path when $useFileAsInput is true; callers store raw image data here otherwise. */
    public $image;

    /** @var mixed Size of the raw image data, as provided by the caller. */
    public $imageSize;

    /** @var string|null Output file base name (without extension); generated lazily when not given. */
    private $outputFile;

    /**
     * @param string|null $image      Image path (or data) to recognize.
     * @param string|null $outputFile Output base name; a temporary one is generated when null.
     */
    public function __construct($image=null, $outputFile=null)
    {
        $this->image = $image;
        $this->outputFile = $outputFile;
    }

    /**
     * @return string The command line, same as casting the object to string.
     */
    public function build() { return "$this"; }

    /**
     * Builds the command line.
     *
     * Note that this queries the tesseract version (spawning the executable)
     * because callable options need it to pick the right flag syntax.
     *
     * @return string
     */
    public function __toString()
    {
        $cmd = array();
        if ($this->threadLimit) $cmd[] = "OMP_THREAD_LIMIT={$this->threadLimit}";
        $cmd[] = self::escape($this->executable);
        $cmd[] = $this->useFileAsInput ? self::escape($this->image) : "-";
        $cmd[] = $this->useFileAsOutput ? self::escape($this->getOutputFile(false)) : "-";

        $version = $this->getTesseractVersion();

        foreach ($this->options as $option) {
            $cmd[] = is_callable($option) ? $option($version) : "$option";
        }
        if ($this->configFile) $cmd[] = $this->configFile;

        return join(' ', $cmd);
    }

    /**
     * Returns the output file path, generating a temporary base name on first use.
     *
     * @param bool $withExt When true the extension is appended: the config
     *                      file name for 'hocr', 'tsv' and 'pdf', 'txt' otherwise.
     * @return string
     */
    public function getOutputFile($withExt=true)
    {
        if (!$this->outputFile) {
            // tempnam() creates a uniquely named file; only its base name is
            // reused, placed under the configured temporary directory.
            $this->outputFile = $this->getTempDir()
                .DIRECTORY_SEPARATOR
                .basename(tempnam($this->getTempDir(), 'ocr'));
        }
        if (!$withExt) return $this->outputFile;

        $hasCustomExt = array('hocr', 'tsv', 'pdf');
        // Strict comparison: only the exact config names select a custom extension.
        $ext = in_array($this->configFile, $hasCustomExt, true) ? $this->configFile : 'txt';
        return "{$this->outputFile}.{$ext}";
    }

    /**
     * @return string The custom temporary directory, or the system default when none is set.
     */
    public function getTempDir()
    {
        return $this->tempDir ?: sys_get_temp_dir();
    }

    /**
     * Runs `<executable> --version` and returns the second space-separated
     * token of the first output line.
     *
     * @return string|null The version token, or null when the command printed
     *                     nothing or its first line holds a single token.
     */
    public function getTesseractVersion()
    {
        exec(self::escape($this->executable).' --version 2>&1', $output);

        // Guard against empty output instead of triggering undefined-offset warnings.
        if (empty($output)) return null;

        $outputParts = explode(' ', $output[0]);
        return isset($outputParts[1]) ? $outputParts[1] : null;
    }

    /**
     * Runs `<executable> --list-langs` and returns its output lines, sorted,
     * without the first line.
     *
     * @return array
     */
    public function getAvailableLanguages()
    {
        exec(self::escape($this->executable) . ' --list-langs 2>&1', $output);
        array_shift($output);
        sort($output);
        return $output;
    }

    /**
     * Wraps a string in double quotes, backslash-escaping `$`, `"` and the
     * backtick; the backslash itself is escaped too except on Windows.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        $charlist = strtoupper(substr(PHP_OS, 0, 3)) === 'WIN' ? '$"`' : '$"\\`';
        return '"'.addcslashes($str, $charlist).'"';
    }
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkTesseractVersion() when the installed
 * tesseract version is lower than the version a feature requires.
 */
class FeatureNotAvailableException extends TesseractOcrException
{
}

View file

@ -0,0 +1,120 @@
<?php namespace thiagoalessio\TesseractOCR;
/**
 * Collection of static precondition/postcondition checks that turn common
 * failures into exceptions carrying a human-readable, multi-line message.
 *
 * Every check returns nothing on success and throws on failure.
 */
class FriendlyErrors
{
    /**
     * Ensures the image path exists.
     *
     * @param string $image
     * @throws ImageNotFoundException When the path does not exist.
     */
    public static function checkImagePath($image)
    {
        if (file_exists($image)) return;

        $currentDir = __DIR__;
        $msg = array();
        $msg[] = "Error! The image \"$image\" was not found.";
        $msg[] = '';
        $msg[] = "The current __DIR__ is $currentDir";
        $msg = join(PHP_EOL, $msg);

        throw new ImageNotFoundException($msg);
    }

    /**
     * Ensures the executable is either an existing file or resolvable by the
     * shell (`where.exe` on Windows, `type` elsewhere).
     *
     * @param string $executable
     * @throws TesseractNotFoundException When the executable cannot be located.
     */
    public static function checkTesseractPresence($executable)
    {
        if (file_exists($executable)) return;

        $cmd = stripos(PHP_OS, 'win') === 0
            ? 'where.exe '.Command::escape($executable).' > NUL 2>&1'
            : 'type '.Command::escape($executable).' > /dev/null 2>&1';
        system($cmd, $exitCode);

        if ($exitCode === 0) return;

        $currentPath = getenv('PATH');
        $msg = array();
        $msg[] = "Error! The command \"$executable\" was not found.";
        $msg[] = '';
        $msg[] = 'Make sure you have Tesseract OCR installed on your system:';
        $msg[] = 'https://github.com/tesseract-ocr/tesseract';
        $msg[] = '';
        $msg[] = "The current \$PATH is $currentPath";
        $msg = join(PHP_EOL, $msg);

        throw new TesseractNotFoundException($msg);
    }

    /**
     * Ensures the command produced output: a non-empty output file when the
     * command writes to a file, a non-empty stdout otherwise.
     *
     * @param object $command Object exposing `useFileAsOutput` and `getOutputFile()`.
     * @param string $stdout  Captured standard output.
     * @param string $stderr  Captured standard error, appended to the message.
     * @throws UnsuccessfulCommandException When no output was produced.
     */
    public static function checkCommandExecution($command, $stdout, $stderr)
    {
        if ($command->useFileAsOutput) {
            $file = $command->getOutputFile();
            if (file_exists($file) && filesize($file) > 0) return;
        }

        if (!$command->useFileAsOutput && $stdout) {
            return;
        }

        $msg = array();
        $msg[] = 'Error! The command did not produce any output.';
        $msg[] = '';
        $msg[] = 'Generated command:';
        $msg[] = "$command";
        $msg[] = '';
        $msg[] = 'Returned message:';
        $arrayStderr = explode(PHP_EOL, $stderr);
        // A trailing newline leaves an empty last element; drop only that one
        // so a message without trailing newline keeps its last line.
        if (end($arrayStderr) === '') array_pop($arrayStderr);
        $msg = array_merge($msg, $arrayStderr);
        $msg = join(PHP_EOL, $msg);

        throw new UnsuccessfulCommandException($msg);
    }

    /**
     * Ensures the process handle is not `false`.
     *
     * @param mixed  $processHandle Handle to verify.
     * @param string $command       Command line, included in the message.
     * @throws UnsuccessfulCommandException When the handle is `false`.
     */
    public static function checkProcessCreation($processHandle, $command)
    {
        if ($processHandle !== false) return;

        $msg = array();
        $msg[] = 'Error! The command could not be launched.';
        $msg[] = '';
        $msg[] = 'Generated command:';
        $msg[] = "$command";
        $msg = join(PHP_EOL, $msg);

        throw new UnsuccessfulCommandException($msg);
    }

    /**
     * Ensures the installed tesseract version is at least `$expected`.
     *
     * @param string $expected Minimum version required.
     * @param string $action   Description of the feature, used in the message.
     * @param object $command  Object exposing `getTesseractVersion()`.
     * @throws FeatureNotAvailableException When the installed version is lower.
     */
    public static function checkTesseractVersion($expected, $action, $command)
    {
        // Strip an optional leading "v"; the cast keeps this safe when no
        // version could be determined.
        $actual = preg_replace('/^v/', '', (string) $command->getTesseractVersion());

        if (version_compare($actual, $expected, ">=")) return;

        $msg = array();
        $msg[] = "Error! $action is not available this tesseract version";
        $msg[] = "Required version is $expected, actual version is $actual";
        $msg[] = '';
        $msg[] = 'Generated command:';
        $msg[] = "$command";
        $msg = join(PHP_EOL, $msg);

        throw new FeatureNotAvailableException($msg);
    }

    /**
     * Ensures `$path` can be written: its directory must be writable (it is
     * created, including missing parents, when absent) and the file itself
     * must be writable if it already exists.
     *
     * @param string $path Output file path.
     * @throws NoWritePermissionsForOutputFile When the file or directory is not writable.
     */
    public static function checkWritePermissions($path)
    {
        $directory = dirname($path);
        // Recursive so that nested, not yet existing output folders work too.
        if (!is_dir($directory)) mkdir($directory, 0777, true);

        $writableDirectory = is_writable($directory);
        $writableFile = true;
        if (file_exists($path)) $writableFile = is_writable($path);

        if ($writableFile && $writableDirectory) return;

        $msg = array();
        $msg[] = "Error! No permission to write to $path";
        $msg[] = "Make sure you have the right outputFile and permissions "
            ."to write to the folder";
        $msg[] = '';
        $msg = join(PHP_EOL, $msg);

        throw new NoWritePermissionsForOutputFile($msg);
    }
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkImagePath() when the given image path does
 * not exist.
 */
class ImageNotFoundException extends TesseractOcrException
{
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkWritePermissions() when the output file or
 * its directory is not writable.
 */
class NoWritePermissionsForOutputFile extends TesseractOcrException
{
}

View file

@ -0,0 +1,79 @@
<?php namespace thiagoalessio\TesseractOCR;
/**
 * Factories for tesseract command-line options.
 *
 * Each factory returns a closure that produces the option string; closures
 * that need it receive the installed tesseract version as their argument.
 */
class Option
{
    /**
     * Page segmentation mode; rendered as `--psm` from version 4 on and as
     * `-psm` for earlier versions.
     *
     * @param mixed $psm
     * @return \Closure
     */
    public static function psm($psm)
    {
        return function($version) use ($psm) {
            $version = preg_replace('/^v/', '', $version);
            return (version_compare($version, '4', '>=') ? '-' : '')."-psm $psm";
        };
    }

    /**
     * OCR engine mode (`--oem`).
     *
     * @param mixed $oem
     * @return \Closure Throws \Exception when invoked with a version below 3.05.
     */
    public static function oem($oem)
    {
        return function($version) use ($oem) {
            Option::checkMinVersion('3.05', $version, 'oem');
            return "--oem $oem";
        };
    }

    /**
     * Image resolution (`--dpi`).
     *
     * @param mixed $dpi
     * @return \Closure
     */
    public static function dpi($dpi)
    {
        return function() use ($dpi) {
            return "--dpi $dpi";
        };
    }

    /**
     * Path of a user words file (`--user-words`), double-quoted and escaped.
     *
     * @param string $path
     * @return \Closure Throws \Exception when invoked with a version below 3.04.
     */
    public static function userWords($path)
    {
        return function($version) use ($path) {
            Option::checkMinVersion('3.04', $version, 'user-words');
            return '--user-words "'.addcslashes($path, '\\"').'"';
        };
    }

    /**
     * Path of a user patterns file (`--user-patterns`), double-quoted and escaped.
     *
     * @param string $path
     * @return \Closure Throws \Exception when invoked with a version below 3.04.
     */
    public static function userPatterns($path)
    {
        return function($version) use ($path) {
            Option::checkMinVersion('3.04', $version, 'user-patterns');
            return '--user-patterns "'.addcslashes($path, '\\"').'"';
        };
    }

    /**
     * Location of the tessdata directory (`--tessdata-dir`), double-quoted and escaped.
     *
     * @param string $path
     * @return \Closure
     */
    public static function tessdataDir($path)
    {
        return function() use ($path) {
            return '--tessdata-dir "'.addcslashes($path, '\\"').'"';
        };
    }

    /**
     * Languages (`-l`); accepts any number of language codes, joined with `+`.
     *
     * @return \Closure
     */
    public static function lang()
    {
        $languages = func_get_args();
        return function() use ($languages) {
            return '-l '.join('+', $languages);
        };
    }

    /**
     * Arbitrary configuration variable (`-c "name=value"`).
     *
     * The variable name is converted from camelCase to snake_case, so
     * `tesseditCharWhitelist` becomes `tessedit_char_whitelist`.
     *
     * @param string $var
     * @param mixed  $value
     * @return \Closure
     */
    public static function config($var, $value)
    {
        return function() use($var, $value) {
            $snakeCase = function($str) {
                // The quantifier sits inside the group so a run of capitals
                // is kept as a whole instead of being reduced to its last letter.
                return strtolower(preg_replace('/([A-Z]+)/', '_$1', $str));
            };
            $pair = $snakeCase($var).'='.$value;
            return '-c "'.addcslashes($pair, '\\"').'"';
        };
    }

    /**
     * Throws when `$currVersion` is lower than `$minVersion`; a leading "v"
     * is ignored on both.
     *
     * @param string $minVersion  Minimum version supporting the option.
     * @param string $currVersion Installed tesseract version.
     * @param string $option      Option name, used in the message.
     * @throws \Exception When the installed version is too old.
     */
    public static function checkMinVersion($minVersion, $currVersion, $option)
    {
        $minVersion = preg_replace('/^v/', '', $minVersion);
        $currVersion = preg_replace('/^v/', '', $currVersion);
        if (!version_compare($currVersion, $minVersion, '<')) return;

        $msg = "$option option is only available on Tesseract $minVersion or later.";
        $msg.= PHP_EOL."Your version of Tesseract is $currVersion";
        throw new \Exception($msg);
    }
}

View file

@ -0,0 +1,83 @@
<?php namespace thiagoalessio\TesseractOCR;
/**
 * Thin wrapper around a child process started with proc_open(), exposing its
 * stdin for writing and collecting its stdout/stderr.
 */
class Process {

    /** @var resource|null Pipe connected to the child's standard input. */
    private $stdin;

    /** @var resource|null Pipe connected to the child's standard output. */
    private $stdout;

    /** @var resource|null Pipe connected to the child's standard error. */
    private $stderr;

    /** @var resource|false Handle returned by proc_open(). */
    private $handle;

    /** @var float Timestamp (microtime) taken when the object was created. */
    private $startTime;

    /**
     * Starts the command with three pipes (stdin, stdout, stderr).
     *
     * @param string $command Command line to launch.
     */
    public function __construct($command)
    {
        $this->startTime = microtime(true);
        $streamDescriptors = [
            array("pipe", "r"),
            array("pipe", "w"),
            array("pipe", "w")
        ];
        $this->handle = proc_open($command, $streamDescriptors, $pipes, NULL, NULL, ["bypass_shell" => true]);

        // Verify the launch before touching the pipes, which are only valid on success.
        FriendlyErrors::checkProcessCreation($this->handle, $command);
        list($this->stdin, $this->stdout, $this->stderr) = $pipes;

        // Non-blocking reads avoid a deadlock in some cases (when the stderr
        // buffer fills up before anything is written to stdout, and vice versa).
        stream_set_blocking($this->stdout, 0);
        stream_set_blocking($this->stderr, 0);
    }

    /**
     * Writes `$data` to the child's stdin, retrying after partial writes.
     *
     * @param string $data Bytes to send.
     * @param int    $len  Number of bytes expected to be written.
     * @return bool True when exactly `$len` bytes were written.
     */
    public function write($data, $len)
    {
        $total = 0;
        do
        {
            $res = fwrite($this->stdin, substr($data, $total));
            // Accumulate the byte count itself; the former
            // `$total += $res < $len` added a boolean due to operator precedence.
            if ($res) $total += $res;
        } while ($res && $total < $len);
        return $total === $len;
    }

    /**
     * Polls the child until it stops running or the timeout elapses,
     * gathering everything it writes to stdout and stderr.
     *
     * @param int|float $timeout Seconds since construction after which polling
     *                           stops; 0 (or less) disables the timeout.
     * @return array Array with the keys "out" and "err".
     */
    public function wait($timeout = 0)
    {
        $running = true;
        $data = ["out" => "", "err" => ""];
        while (($running === true) && !$this->hasTimedOut($timeout))
        {
            $data["out"] .= fread($this->stdout, 8192);
            $data["err"] .= fread($this->stderr, 8192);
            $procInfo = proc_get_status($this->handle);
            $running = $procInfo["running"];
            if ($running) {
                usleep(1000); // Sleep 1ms to yield CPU time
            }
        }

        // The process may have written more than one chunk, or written between
        // the last read and the status check: collect what is still buffered.
        // Skipped while the process is still running so a timeout stays a timeout.
        if (!$running) {
            $data["out"] .= stream_get_contents($this->stdout);
            $data["err"] .= stream_get_contents($this->stderr);
        }
        return $data;
    }

    /**
     * Closes all pipes still open, then the process itself.
     *
     * @return int Value returned by proc_close().
     */
    public function close()
    {
        $this->closeStream($this->stdin);
        $this->closeStream($this->stdout);
        $this->closeStream($this->stderr);
        return proc_close($this->handle);
    }

    /**
     * Closes the child's stdin pipe (if still open).
     */
    public function closeStdin()
    {
        $this->closeStream($this->stdin);
    }

    /**
     * @param int|float $timeout Seconds; values of 0 or less never time out.
     * @return bool True when more than `$timeout` seconds passed since construction.
     */
    private function hasTimedOut($timeout)
    {
        return (($timeout > 0) && ($this->startTime + $timeout < microtime(true)));
    }

    /**
     * Closes a stream and resets the referenced variable to NULL so it is
     * not closed twice.
     *
     * @param resource|null $stream
     */
    private function closeStream(&$stream)
    {
        if ($stream !== NULL)
        {
            fclose($stream);
            $stream = NULL;
        }
    }
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkTesseractPresence() when the tesseract
 * executable can be found neither as a file nor through the shell lookup.
 */
class TesseractNotFoundException extends TesseractOcrException
{
}

View file

@ -0,0 +1,181 @@
<?php namespace thiagoalessio\TesseractOCR;
use thiagoalessio\TesseractOCR\Command;
use thiagoalessio\TesseractOCR\Option;
use thiagoalessio\TesseractOCR\FriendlyErrors;
/**
 * Fluent entry point of the library: configure the recognition through
 * chained calls and execute it with run().
 *
 * Unknown method calls are translated into tesseract options, see __call().
 */
class TesseractOCR
{
    /** @var Command Command being configured and eventually executed. */
    public $command;

    /** @var string|null Destination chosen with setOutputFile(), if any. */
    private $outputFile = null;

    /**
     * @param string|null  $image   Path of the image to recognize.
     * @param Command|null $command Command to configure; a new one is created when omitted.
     */
    public function __construct($image=null, $command=null)
    {
        if (!$command) {
            $command = new Command;
        }
        $this->command = $command;
        $this->image("$image");
    }

    /**
     * Executes tesseract and returns the recognized text, trimmed.
     *
     * @param int $timeout Passed to Process::wait().
     * @return string
     * @throws TesseractOcrException Re-thrown after temporary files are removed.
     */
    public function run($timeout = 0)
    {
        $command = $this->command;
        $hasCustomOutput = $this->outputFile !== null;

        try {
            if ($hasCustomOutput) {
                FriendlyErrors::checkWritePermissions($this->outputFile);
                // A custom destination always goes through an output file.
                $command->useFileAsOutput = true;
            }

            FriendlyErrors::checkTesseractPresence($command->executable);
            if ($command->useFileAsInput) {
                FriendlyErrors::checkImagePath($command->image);
            }

            $process = new Process((string) $command);

            if (!$command->useFileAsInput) {
                // Image data is fed through stdin, which must then be closed.
                $process->write($command->image, $command->imageSize);
                $process->closeStdin();
            }
            $result = $process->wait($timeout);

            FriendlyErrors::checkCommandExecution($command, $result["out"], $result["err"]);
        } catch (TesseractOcrException $error) {
            if ($command->useFileAsOutput) {
                $this->cleanTempFiles();
            }
            throw $error;
        }

        if (!$command->useFileAsOutput) {
            return trim($result["out"], " \t\n\r\0\x0A\x0B\x0C");
        }

        $generated = $command->getOutputFile();
        $text = file_get_contents($generated);
        if ($hasCustomOutput) {
            rename($generated, $this->outputFile);
        }
        $this->cleanTempFiles();

        return trim($text, " \t\n\r\0\x0A\x0B\x0C");
    }

    /**
     * Supplies the image as in-memory data instead of a file path.
     *
     * @param string $image Raw image data.
     * @param int    $size  Size of the data.
     * @return $this
     */
    public function imageData($image, $size)
    {
        FriendlyErrors::checkTesseractVersion("3.03-rc1", "Reading image data from stdin", $this->command);

        $command = $this->command;
        $command->useFileAsInput = false;
        $command->image = $image;
        $command->imageSize = $size;

        return $this;
    }

    /**
     * Reads the result from tesseract's standard output instead of a file.
     *
     * @return $this
     */
    public function withoutTempFiles()
    {
        FriendlyErrors::checkTesseractVersion("3.03-rc1", "Writing to stdout (without using temp files)", $this->command);

        $this->command->useFileAsOutput = false;

        return $this;
    }

    /**
     * @param string $image Path of the image to recognize.
     * @return $this
     */
    public function image($image)
    {
        $this->command->image = $image;

        return $this;
    }

    /**
     * Sets the tesseract executable after verifying it can be found.
     *
     * @param string $executable
     * @return $this
     */
    public function executable($executable)
    {
        FriendlyErrors::checkTesseractPresence($executable);
        $this->command->executable = $executable;

        return $this;
    }

    /**
     * @param string $configFile Config file name or path.
     * @return $this
     */
    public function configFile($configFile)
    {
        $this->command->configFile = $configFile;

        return $this;
    }

    /**
     * @param string $tempDir Directory for temporary files.
     * @return $this
     */
    public function tempDir($tempDir)
    {
        $this->command->tempDir = $tempDir;

        return $this;
    }

    /**
     * @param int $limit Thread limit stored on the command.
     * @return $this
     */
    public function threadLimit($limit)
    {
        $this->command->threadLimit = $limit;

        return $this;
    }

    /**
     * @deprecated Alias of configFile().
     */
    public function format($fmt)
    {
        return $this->configFile($fmt);
    }

    /**
     * @param string $path Destination the generated output file is moved to by run().
     * @return $this
     */
    public function setOutputFile($path)
    {
        $this->outputFile = $path;

        return $this;
    }

    /**
     * Restricts recognition to the given characters; accepts any mix of
     * strings and arrays, all concatenated into one list.
     *
     * @return $this
     */
    public function allowlist()
    {
        $characters = '';
        foreach (func_get_args() as $piece) {
            $characters .= is_array($piece) ? implode('', $piece) : $piece;
        }
        $this->command->options[] = Option::config('tessedit_char_whitelist', $characters);

        return $this;
    }

    /**
     * Deprecated alias of allowlist(); emits an E_USER_NOTICE.
     *
     * @return $this
     */
    public function whitelist()
    {
        $notice = 'Notice: whitelist is deprecated, use allowlist instead.';
        trigger_error($notice, E_USER_NOTICE);

        $characters = '';
        foreach (func_get_args() as $piece) {
            $characters .= is_array($piece) ? implode('', $piece) : $piece;
        }

        return $this->allowlist($characters);
    }

    /**
     * @return mixed Version reported by the command.
     */
    public function version()
    {
        return $this->command->getTesseractVersion();
    }

    /**
     * @return array Languages reported by the command.
     */
    public function availableLanguages()
    {
        return $this->command->getAvailableLanguages();
    }

    /**
     * Handles every method not defined on this class:
     * known config file names select that config file, names matching a
     * method of the Option class add that option, and anything else is
     * added as a configuration variable with the first argument as value.
     *
     * @param string $method
     * @param array  $args
     * @return $this
     */
    public function __call($method, $args)
    {
        if ($this->isConfigFile($method)) {
            return $this->configFile($method);
        }

        if ($this->isOption($method)) {
            $factory = $this->getOptionClassName().'::'.$method;
            $built = call_user_func_array($factory, $args);
        } else {
            $value = empty($args) ? null : $args[0];
            $built = Option::config($method, $value);
        }
        $this->command->options[] = $built;

        return $this;
    }

    /**
     * @param string $name
     * @return bool Whether the name is one of the predefined config files.
     */
    private function isConfigFile($name)
    {
        $known = array('digits', 'hocr', 'pdf', 'quiet', 'tsv', 'txt');

        return in_array($name, $known);
    }

    /**
     * @param string $name
     * @return bool Whether the Option class has a method with that name.
     */
    private function isOption($name)
    {
        $methods = get_class_methods($this->getOptionClassName());

        return in_array($name, $methods);
    }

    /**
     * @return string Fully qualified name of the Option class.
     */
    private function getOptionClassName()
    {
        return __NAMESPACE__.'\\Option';
    }

    /**
     * Deletes the output file, both without and with its extension, when present.
     */
    private function cleanTempFiles()
    {
        foreach (array(false, true) as $withExt) {
            if (file_exists($this->command->getOutputFile($withExt))) {
                unlink($this->command->getOutputFile($withExt));
            }
        }
    }
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Common base class of the library's exceptions; TesseractOCR::run() catches
 * this type to remove temporary files before re-throwing.
 */
abstract class TesseractOcrException extends \Exception
{
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkProcessCreation() when the command could
 * not be launched, and by FriendlyErrors::checkCommandExecution() when it
 * produced no output.
 */
class UnsuccessfulCommandException extends TesseractOcrException
{
}