mirror of
https://git.friendi.ca/friendica/friendica-addons.git
synced 2025-07-11 02:48:48 +00:00
New addon "tesseract" for OCR
This commit is contained in:
parent
a179bab747
commit
4dd903b473
28 changed files with 1904 additions and 0 deletions
14
tesseract/vendor/thiagoalessio/tesseract_ocr/.appveyor.yml
vendored
Normal file
14
tesseract/vendor/thiagoalessio/tesseract_ocr/.appveyor.yml
vendored
Normal file
|
@ -0,0 +1,14 @@
|
|||
---
|
||||
build: false
|
||||
|
||||
install:
|
||||
- ps: Set-Service wuauserv -StartupType Manual
|
||||
- choco install php
|
||||
- choco install capture2text --version 3.9
|
||||
- choco install composer
|
||||
- refreshenv
|
||||
- cd %APPVEYOR_BUILD_FOLDER%
|
||||
- composer install
|
||||
|
||||
test_script:
|
||||
- php tests\run.php unit e2e
|
19
tesseract/vendor/thiagoalessio/tesseract_ocr/MIT-LICENSE
vendored
Normal file
19
tesseract/vendor/thiagoalessio/tesseract_ocr/MIT-LICENSE
vendored
Normal file
|
@ -0,0 +1,19 @@
|
|||
Copyright (c) 2012-2021 Thiago Alessio Pereira
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
508
tesseract/vendor/thiagoalessio/tesseract_ocr/README.md
vendored
Normal file
508
tesseract/vendor/thiagoalessio/tesseract_ocr/README.md
vendored
Normal file
|
@ -0,0 +1,508 @@
|
|||
<img src="https://thiagoalessio.github.io/tesseract-ocr-for-php/images/logo.png" alt="Tesseract OCR for PHP" align="right" width="320px"/>
|
||||
|
||||
# Tesseract OCR for PHP
|
||||
|
||||
A wrapper to work with Tesseract OCR inside PHP.
|
||||
|
||||
[![CI][ci_badge]][ci]
|
||||
[![AppVeyor][appveyor_badge]][appveyor]
|
||||
[![Codacy][codacy_badge]][codacy]
|
||||
[![Test Coverage][test_coverage_badge]][test_coverage]
|
||||
<br/>
|
||||
[![Latest Stable Version][stable_version_badge]][packagist]
|
||||
[![Total Downloads][total_downloads_badge]][packagist]
|
||||
[![Monthly Downloads][monthly_downloads_badge]][packagist]
|
||||
|
||||
## Installation
|
||||
|
||||
Via [Composer][]:
|
||||
|
||||
$ composer require thiagoalessio/tesseract_ocr
|
||||
|
||||
:bangbang: **This library depends on [Tesseract OCR][], version _3.02_ or later.**
|
||||
|
||||
<br/>
|
||||
|
||||
### ![][windows_icon] Note for Windows users
|
||||
|
||||
There are [many ways][tesseract_installation_on_windows] to install
|
||||
[Tesseract OCR][] on your system, but if you just want something quick to
|
||||
get up and running, I recommend installing the [Capture2Text][] package with
|
||||
[Chocolatey][].
|
||||
|
||||
choco install capture2text --version 3.9
|
||||
|
||||
:warning: Recent versions of [Capture2Text][] stopped shipping the `tesseract` binary.
|
||||
|
||||
<br/>
|
||||
|
||||
### ![][macos_icon] Note for macOS users
|
||||
|
||||
With [MacPorts][] you can install support for individual languages, like so:
|
||||
|
||||
$ sudo port install tesseract-<langcode>
|
||||
|
||||
But that is not possible with [Homebrew][]. It comes only with **English** support
|
||||
by default, so if you intend to use it for other languages, the quickest solution
|
||||
is to install them all:
|
||||
|
||||
$ brew install tesseract tesseract-lang
|
||||
|
||||
<br/>
|
||||
|
||||
## Usage
|
||||
|
||||
### Basic usage
|
||||
|
||||
<img align="right" width="50%" title="The quick brown fox jumps over the lazy dog." src="./tests/EndToEnd/images/text.png"/>
|
||||
|
||||
```php
|
||||
use thiagoalessio\TesseractOCR\TesseractOCR;
|
||||
echo (new TesseractOCR('text.png'))
|
||||
->run();
|
||||
```
|
||||
|
||||
```
|
||||
The quick brown fox
|
||||
jumps over
|
||||
the lazy dog.
|
||||
```
|
||||
|
||||
<br/>
|
||||
|
||||
### Other languages
|
||||
|
||||
<img align="right" width="50%" title="Bülowstraße" src="./tests/EndToEnd/images/german.png"/>
|
||||
|
||||
```php
|
||||
use thiagoalessio\TesseractOCR\TesseractOCR;
|
||||
echo (new TesseractOCR('german.png'))
|
||||
->lang('deu')
|
||||
->run();
|
||||
```
|
||||
|
||||
```
|
||||
Bülowstraße
|
||||
```
|
||||
|
||||
<br/>
|
||||
|
||||
### Multiple languages
|
||||
|
||||
<img align="right" width="50%" title="I eat すし y Pollo" src="./tests/EndToEnd/images/mixed-languages.png"/>
|
||||
|
||||
```php
|
||||
use thiagoalessio\TesseractOCR\TesseractOCR;
|
||||
echo (new TesseractOCR('mixed-languages.png'))
|
||||
->lang('eng', 'jpn', 'spa')
|
||||
->run();
|
||||
```
|
||||
|
||||
```
|
||||
I eat すし y Pollo
|
||||
```
|
||||
|
||||
<br/>
|
||||
|
||||
### Inducing recognition
|
||||
|
||||
<img align="right" width="50%" title="8055" src="./tests/EndToEnd/images/8055.png"/>
|
||||
|
||||
```php
|
||||
use thiagoalessio\TesseractOCR\TesseractOCR;
|
||||
echo (new TesseractOCR('8055.png'))
|
||||
->allowlist(range('A', 'Z'))
|
||||
->run();
|
||||
```
|
||||
|
||||
```
|
||||
BOSS
|
||||
```
|
||||
|
||||
<br/>
|
||||
|
||||
### Breaking CAPTCHAs
|
||||
|
||||
Yes, I know some of you might want to use this library for the *noble* purpose
|
||||
of breaking CAPTCHAs, so please take a look at this comment:
|
||||
|
||||
<https://github.com/thiagoalessio/tesseract-ocr-for-php/issues/91#issuecomment-342290510>
|
||||
|
||||
## API
|
||||
|
||||
### run
|
||||
|
||||
Executes a `tesseract` command, optionally receiving an integer as `timeout`,
|
||||
in case you experience stalled tesseract processes.
|
||||
|
||||
```php
|
||||
$ocr = new TesseractOCR();
|
||||
$ocr->run();
|
||||
```
|
||||
```php
|
||||
$ocr = new TesseractOCR();
|
||||
$timeout = 500;
|
||||
$ocr->run($timeout);
|
||||
```
|
||||
|
||||
### image
|
||||
|
||||
Define the path of an image to be recognized by `tesseract`.
|
||||
|
||||
```php
|
||||
$ocr = new TesseractOCR();
|
||||
$ocr->image('/path/to/image.png');
|
||||
$ocr->run();
|
||||
```
|
||||
|
||||
### imageData
|
||||
|
||||
Set the image to be recognized by `tesseract` from a string, with its size.
|
||||
This can be useful when dealing with files that are already loaded in memory.
|
||||
You can easily retrieve the image data and size of an image object :
|
||||
```php
|
||||
//Using Imagick
|
||||
$data = $img->getImageBlob();
|
||||
$size = $img->getImageLength();
|
||||
//Using GD
|
||||
ob_start();
|
||||
// Note that you can use any format supported by tesseract
|
||||
imagepng($img, null, 0);
|
||||
$size = ob_get_length();
|
||||
$data = ob_get_clean();
|
||||
|
||||
$ocr = new TesseractOCR();
|
||||
$ocr->imageData($data, $size);
|
||||
$ocr->run();
|
||||
```
|
||||
|
||||
### executable
|
||||
|
||||
Define a custom location of the `tesseract` executable,
|
||||
if for any reason it is not present in the `$PATH`.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->executable('/path/to/tesseract')
|
||||
->run();
|
||||
```
|
||||
|
||||
### version
|
||||
|
||||
Returns the current version of `tesseract`.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR())->version();
|
||||
```
|
||||
|
||||
### availableLanguages
|
||||
|
||||
Returns a list of available languages/scripts.
|
||||
|
||||
```php
|
||||
foreach((new TesseractOCR())->availableLanguages() as $lang) echo $lang;
|
||||
```
|
||||
|
||||
__More info:__ <https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages-and-scripts>
|
||||
|
||||
### tessdataDir
|
||||
|
||||
Specify a custom location for the tessdata directory.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->tessdataDir('/path')
|
||||
->run();
|
||||
```
|
||||
|
||||
### userWords
|
||||
|
||||
Specify the location of user words file.
|
||||
|
||||
This is a plain text file containing a list of words that you want to be
|
||||
considered as normal dictionary words by `tesseract`.
|
||||
|
||||
Useful when dealing with contents that contain technical terminology, jargon,
|
||||
etc.
|
||||
|
||||
```
|
||||
$ cat /path/to/user-words.txt
|
||||
foo
|
||||
bar
|
||||
```
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->userWords('/path/to/user-words.txt')
|
||||
->run();
|
||||
```
|
||||
|
||||
### userPatterns
|
||||
|
||||
Specify the location of user patterns file.
|
||||
|
||||
If the contents you are dealing with have known patterns, this option can help
|
||||
improve tesseract's recognition accuracy a lot.
|
||||
|
||||
```
|
||||
$ cat /path/to/user-patterns.txt
|
||||
1-\d\d\d-GOOG-441
|
||||
www.\n\\\*.com
|
||||
```
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->userPatterns('/path/to/user-patterns.txt')
|
||||
->run();
|
||||
```
|
||||
|
||||
### lang
|
||||
|
||||
Define one or more languages to be used during the recognition.
|
||||
A complete list of available languages can be found at:
|
||||
<https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages>
|
||||
|
||||
__Tip from [@daijiale][]:__ Use the combination `->lang('chi_sim', 'chi_tra')`
|
||||
for proper recognition of Chinese.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->lang('lang1', 'lang2', 'lang3')
|
||||
->run();
|
||||
```
|
||||
|
||||
### psm
|
||||
|
||||
Specify the Page Segmentation Method, which instructs `tesseract` how to
|
||||
interpret the given image.
|
||||
|
||||
__More info:__ <https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality#page-segmentation-method>
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->psm(6)
|
||||
->run();
|
||||
```
|
||||
|
||||
### oem
|
||||
|
||||
Specify the OCR Engine Mode. (see `tesseract --help-oem`)
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->oem(2)
|
||||
->run();
|
||||
```
|
||||
|
||||
### dpi
|
||||
|
||||
Specify the image DPI. It is useful if your image does not contain this information in its metadata.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->dpi(300)
|
||||
->run();
|
||||
```
|
||||
|
||||
### allowlist
|
||||
|
||||
This is a shortcut for `->config('tessedit_char_whitelist', 'abcdef....')`.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->allowlist(range('a', 'z'), range(0, 9), '-_@')
|
||||
->run();
|
||||
```
|
||||
|
||||
### configFile
|
||||
|
||||
Specify a config file to be used. It can either be the path to your own
|
||||
config file or the name of one of the predefined config files:
|
||||
<https://github.com/tesseract-ocr/tesseract/tree/master/tessdata/configs>
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->configFile('hocr')
|
||||
->run();
|
||||
```
|
||||
|
||||
### setOutputFile
|
||||
|
||||
Specify an Outputfile to be used. Be aware: If you set an outputfile then
|
||||
the option `withoutTempFiles` is ignored.
|
||||
Tempfiles are written (and deleted) even if `withoutTempFiles = true`.
|
||||
|
||||
In combination with `configFile` you are able to get the `hocr`, `tsv` or
|
||||
`pdf` files.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->configFile('pdf')
|
||||
->setOutputFile('/PATH_TO_MY_OUTPUTFILE/searchable.pdf')
|
||||
->run();
|
||||
```
|
||||
|
||||
### digits
|
||||
|
||||
Shortcut for `->configFile('digits')`.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->digits()
|
||||
->run();
|
||||
```
|
||||
|
||||
### hocr
|
||||
|
||||
Shortcut for `->configFile('hocr')`.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->hocr()
|
||||
->run();
|
||||
```
|
||||
|
||||
### pdf
|
||||
|
||||
Shortcut for `->configFile('pdf')`.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->pdf()
|
||||
->run();
|
||||
```
|
||||
|
||||
### quiet
|
||||
|
||||
Shortcut for `->configFile('quiet')`.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->quiet()
|
||||
->run();
|
||||
```
|
||||
|
||||
### tsv
|
||||
|
||||
Shortcut for `->configFile('tsv')`.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->tsv()
|
||||
->run();
|
||||
```
|
||||
|
||||
### txt
|
||||
|
||||
Shortcut for `->configFile('txt')`.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->txt()
|
||||
->run();
|
||||
```
|
||||
|
||||
### tempDir
|
||||
|
||||
Define a custom directory to store temporary files generated by tesseract.
|
||||
Make sure the directory actually exists and the user running `php` is allowed
|
||||
to write in there.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->tempDir('./my/custom/temp/dir')
|
||||
->run();
|
||||
```
|
||||
|
||||
### withoutTempFiles
|
||||
|
||||
Specify that `tesseract` should output the recognized text without writing to temporary files.
|
||||
The data is gathered from the standard output of `tesseract` instead.
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->withoutTempFiles()
|
||||
->run();
|
||||
```
|
||||
|
||||
### Other options
|
||||
|
||||
Any configuration option offered by Tesseract can be used like that:
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->config('config_var', 'value')
|
||||
->config('other_config_var', 'other value')
|
||||
->run();
|
||||
```
|
||||
|
||||
Or like that:
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->configVar('value')
|
||||
->otherConfigVar('other value')
|
||||
->run();
|
||||
```
|
||||
|
||||
__More info:__ <https://github.com/tesseract-ocr/tesseract/wiki/ControlParams>
|
||||
|
||||
### Thread-limit
|
||||
|
||||
Sometimes, it may be useful to limit the number of threads that tesseract is
|
||||
allowed to use (e.g. in [this case](https://github.com/tesseract-ocr/tesseract/issues/898)).
|
||||
Set the maximum number of threads with the `threadLimit` method:
|
||||
|
||||
```php
|
||||
echo (new TesseractOCR('img.png'))
|
||||
->threadLimit(1)
|
||||
->run();
|
||||
```
|
||||
|
||||
## How to contribute
|
||||
|
||||
You can contribute to this project by:
|
||||
|
||||
* Opening an [Issue][] if you found a bug or wish to propose a new feature;
|
||||
* Placing a [Pull Request][] with code that fixes a bug, missing/wrong documentation
|
||||
or implements a new feature;
|
||||
|
||||
Just make sure you take a look at our [Code of Conduct][] and [Contributing][]
|
||||
instructions.
|
||||
|
||||
## License
|
||||
|
||||
tesseract-ocr-for-php is released under the [MIT License][].
|
||||
|
||||
|
||||
<h2></h2><p align="center"><sub>Made with <sub><a href="#"><img src="https://thiagoalessio.github.io/tesseract-ocr-for-php/images/heart.svg" alt="love" width="14px"/></a></sub> in Berlin</sub></p>
|
||||
|
||||
[ci_badge]: https://github.com/thiagoalessio/tesseract-ocr-for-php/workflows/CI/badge.svg?event=push&branch=main
|
||||
[ci]: https://github.com/thiagoalessio/tesseract-ocr-for-php/actions?query=workflow%3ACI
|
||||
[appveyor_badge]: https://ci.appveyor.com/api/projects/status/xwy5ls0798iwcim3/branch/main?svg=true
|
||||
[appveyor]: https://ci.appveyor.com/project/thiagoalessio/tesseract-ocr-for-php/branch/main
|
||||
[codacy_badge]: https://app.codacy.com/project/badge/Grade/a81aa10012874f23a57df5b492d835f2
|
||||
[codacy]: https://www.codacy.com/gh/thiagoalessio/tesseract-ocr-for-php/dashboard
|
||||
[test_coverage_badge]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php/branch/main/graph/badge.svg?token=Y0VnrqiSIf
|
||||
[test_coverage]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php
|
||||
[stable_version_badge]: https://img.shields.io/packagist/v/thiagoalessio/tesseract_ocr.svg
|
||||
[packagist]: https://packagist.org/packages/thiagoalessio/tesseract_ocr
|
||||
[total_downloads_badge]: https://img.shields.io/packagist/dt/thiagoalessio/tesseract_ocr.svg
|
||||
[monthly_downloads_badge]: https://img.shields.io/packagist/dm/thiagoalessio/tesseract_ocr.svg
|
||||
[Tesseract OCR]: https://github.com/tesseract-ocr/tesseract
|
||||
[Composer]: http://getcomposer.org/
|
||||
[windows_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/windows-18.svg
|
||||
[macos_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/apple-18.svg
|
||||
[tesseract_installation_on_windows]: https://github.com/tesseract-ocr/tesseract/wiki#windows
|
||||
[Capture2Text]: https://chocolatey.org/packages/capture2text
|
||||
[Chocolatey]: https://chocolatey.org
|
||||
[MacPorts]: https://www.macports.org
|
||||
[Homebrew]: https://brew.sh
|
||||
[@daijiale]: https://github.com/daijiale
|
||||
[HOCR]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#hocr-output
|
||||
[TSV]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#tsv-output-currently-available-in-305-dev-in-master-branch-on-github
|
||||
[Issue]: https://github.com/thiagoalessio/tesseract-ocr-for-php/issues
|
||||
[Pull Request]: https://github.com/thiagoalessio/tesseract-ocr-for-php/pulls
|
||||
[Code of Conduct]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CODE_OF_CONDUCT.md
|
||||
[Contributing]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CONTRIBUTING.md
|
||||
[MIT License]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/MIT-LICENSE
|
4
tesseract/vendor/thiagoalessio/tesseract_ocr/codecov.yml
vendored
Normal file
4
tesseract/vendor/thiagoalessio/tesseract_ocr/codecov.yml
vendored
Normal file
|
@ -0,0 +1,4 @@
|
|||
fixes:
|
||||
- "/home/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::"
|
||||
- "/Users/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::"
|
||||
- "C:\\projects\\tesseract-ocr-for-php\\::"
|
35
tesseract/vendor/thiagoalessio/tesseract_ocr/composer.json
vendored
Normal file
35
tesseract/vendor/thiagoalessio/tesseract_ocr/composer.json
vendored
Normal file
|
@ -0,0 +1,35 @@
|
|||
{
|
||||
"name": "thiagoalessio/tesseract_ocr",
|
||||
"description": "A wrapper to work with Tesseract OCR inside PHP.",
|
||||
"version": "2.13.0",
|
||||
"type": "library",
|
||||
"keywords": ["Tesseract", "OCR", "text recognition"],
|
||||
"license": "MIT",
|
||||
"authors": [
|
||||
{
|
||||
"name": "thiagoalessio",
|
||||
"email": "thiagoalessio@me.com"
|
||||
}
|
||||
],
|
||||
"support": {
|
||||
"issues": "https://github.com/thiagoalessio/tesseract-ocr-for-php/issues",
|
||||
"irc": "irc://irc.freenode.net/tesseract-ocr-for-php",
|
||||
"source": "https://github.com/thiagoalessio/tesseract-ocr-for-php"
|
||||
},
|
||||
"require": {
|
||||
"php": "^5.3 || ^7.0 || ^8.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
|
||||
},
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"thiagoalessio\\TesseractOCR\\": "src/"
|
||||
}
|
||||
},
|
||||
"autoload-dev": {
|
||||
"psr-4": {
|
||||
"thiagoalessio\\TesseractOCR\\Tests\\": "tests/"
|
||||
}
|
||||
}
|
||||
}
|
80
tesseract/vendor/thiagoalessio/tesseract_ocr/src/Command.php
vendored
Normal file
80
tesseract/vendor/thiagoalessio/tesseract_ocr/src/Command.php
vendored
Normal file
|
@ -0,0 +1,80 @@
|
|||
<?php namespace thiagoalessio\TesseractOCR;
|
||||
|
||||
/**
 * Builds the command line used to invoke the `tesseract` executable.
 *
 * Casting an instance to string (or calling build()) yields the complete
 * command: optional thread-limit prefix, executable, input, output, options
 * and finally the config file name.
 */
class Command
{
	/** Name or path of the tesseract executable. */
	public $executable = 'tesseract';

	/** When false, "-" is emitted in place of the image path. */
	public $useFileAsInput = true;

	/** When false, "-" is emitted in place of the output file base name. */
	public $useFileAsOutput = true;

	/** Option strings, or callables that receive the tesseract version. */
	public $options = array();

	/** Config file name/path; appended last to the command when set. */
	public $configFile;

	/** Custom temp directory; sys_get_temp_dir() is used when empty. */
	public $tempDir;

	/** When set, emitted as an OMP_THREAD_LIMIT=<n> prefix. */
	public $threadLimit;

	/** Input image path (used when $useFileAsInput is true). */
	public $image;

	/** Not read anywhere inside this class. */
	public $imageSize;

	/** Output file base name (without extension); lazily generated. */
	private $outputFile;

	/**
	 * @param string|null $image      Path of the image to recognize.
	 * @param string|null $outputFile Output base name; generated on demand when empty.
	 */
	public function __construct($image=null, $outputFile=null)
	{
		$this->image = $image;
		$this->outputFile = $outputFile;
	}

	/**
	 * @return string The same command string produced by __toString().
	 */
	public function build() { return "$this"; }

	/**
	 * Assembles the command string.
	 *
	 * Note that this queries the executable for its version on every call,
	 * because callable options are given the version to decide their syntax.
	 *
	 * @return string
	 */
	public function __toString()
	{
		$cmd = array();
		if ($this->threadLimit) $cmd[] = "OMP_THREAD_LIMIT={$this->threadLimit}";
		$cmd[] = self::escape($this->executable);
		$cmd[] = $this->useFileAsInput ? self::escape($this->image) : "-";
		$cmd[] = $this->useFileAsOutput ? self::escape($this->getOutputFile(false)) : "-";

		$version = $this->getTesseractVersion();

		foreach ($this->options as $option) {
			$cmd[] = is_callable($option) ? $option($version) : "$option";
		}
		if ($this->configFile) $cmd[] = $this->configFile;

		return join(' ', $cmd);
	}

	/**
	 * Returns the output file path, generating a temporary base name on
	 * first use when none was given to the constructor.
	 *
	 * @param bool $withExt Append the extension ("hocr", "tsv" or "pdf" when
	 *                      that is the config file, otherwise "txt").
	 * @return string
	 */
	public function getOutputFile($withExt=true)
	{
		if (!$this->outputFile)
			$this->outputFile = $this->getTempDir()
				.DIRECTORY_SEPARATOR
				.basename(tempnam($this->getTempDir(), 'ocr'));
		if (!$withExt) return $this->outputFile;

		$hasCustomExt = array('hocr', 'tsv', 'pdf');
		// Strict comparison: a non-string configFile (e.g. true or 0) must
		// not be mistaken for one of the custom extensions.
		$ext = in_array($this->configFile, $hasCustomExt, true) ? $this->configFile : 'txt';
		return "{$this->outputFile}.{$ext}";
	}

	/**
	 * @return string The custom temp dir if set, else the system temp dir.
	 */
	public function getTempDir()
	{
		return $this->tempDir ?: sys_get_temp_dir();
	}

	/**
	 * Runs "<executable> --version" and returns the second space-separated
	 * token of the first output line.
	 *
	 * @return string|null The token, or null when the command printed
	 *                     nothing or the first line has no second token.
	 */
	public function getTesseractVersion()
	{
		exec(self::escape($this->executable).' --version 2>&1', $output);
		// Guard against an empty result instead of indexing blindly.
		if (!isset($output[0])) return null;
		$outputParts = explode(' ', $output[0]);
		return isset($outputParts[1]) ? $outputParts[1] : null;
	}

	/**
	 * Runs "<executable> --list-langs", drops the first output line and
	 * returns the remaining lines sorted.
	 *
	 * @return array
	 */
	public function getAvailableLanguages()
	{
		exec(self::escape($this->executable) . ' --list-langs 2>&1', $output);
		array_shift($output);
		sort($output);
		return $output;
	}

	/**
	 * Wraps a string in double quotes, backslash-escaping `$`, `"` and
	 * backtick (plus backslash itself when not running on Windows).
	 *
	 * @param string $str
	 * @return string
	 */
	public static function escape($str)
	{
		$charlist = strtoupper(substr(PHP_OS, 0, 3)) == 'WIN' ? '$"`' : '$"\\`';
		return '"'.addcslashes($str, $charlist).'"';
	}
}
|
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/FeatureNotAvailableException.php
vendored
Normal file
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/FeatureNotAvailableException.php
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
<?php
|
||||
|
||||
namespace thiagoalessio\TesseractOCR;
|
||||
|
||||
/**
 * Thrown by FriendlyErrors::checkTesseractVersion() when the installed
 * tesseract version is older than the version a feature requires.
 */
class FeatureNotAvailableException extends TesseractOcrException
|
||||
{
|
||||
}
|
120
tesseract/vendor/thiagoalessio/tesseract_ocr/src/FriendlyErrors.php
vendored
Normal file
120
tesseract/vendor/thiagoalessio/tesseract_ocr/src/FriendlyErrors.php
vendored
Normal file
|
@ -0,0 +1,120 @@
|
|||
<?php namespace thiagoalessio\TesseractOCR;
|
||||
|
||||
/**
 * Static pre-/post-condition checks that turn common failures into
 * exceptions carrying a descriptive, multi-line message.
 */
class FriendlyErrors
{
	/**
	 * Ensures the image file exists.
	 *
	 * @param string $image Path of the image.
	 * @throws ImageNotFoundException When the path does not exist.
	 */
	public static function checkImagePath($image)
	{
		if (file_exists($image)) return;

		$currentDir = __DIR__;
		$msg = array();
		$msg[] = "Error! The image \"$image\" was not found.";
		$msg[] = '';
		$msg[] = "The current __DIR__ is $currentDir";
		$msg = join(PHP_EOL, $msg);

		throw new ImageNotFoundException($msg);
	}

	/**
	 * Ensures the executable exists, either as a file path or as a command
	 * that `where.exe` (Windows) / `type` (elsewhere) can resolve.
	 *
	 * @param string $executable Name or path of the executable.
	 * @throws TesseractNotFoundException When it cannot be located.
	 */
	public static function checkTesseractPresence($executable)
	{
		if (file_exists($executable)) return;

		$cmd = stripos(PHP_OS, 'win') === 0
			? 'where.exe '.Command::escape($executable).' > NUL 2>&1'
			: 'type '.Command::escape($executable).' > /dev/null 2>&1';
		system($cmd, $exitCode);

		if ($exitCode == 0) return;

		$currentPath = getenv('PATH');
		$msg = array();
		$msg[] = "Error! The command \"$executable\" was not found.";
		$msg[] = '';
		$msg[] = 'Make sure you have Tesseract OCR installed on your system:';
		$msg[] = 'https://github.com/tesseract-ocr/tesseract';
		$msg[] = '';
		$msg[] = "The current \$PATH is $currentPath";
		$msg = join(PHP_EOL, $msg);

		throw new TesseractNotFoundException($msg);
	}

	/**
	 * Ensures the executed command produced output: a non-empty output file
	 * when writing to a file, or non-empty stdout otherwise.
	 *
	 * @param Command $command The command that was run.
	 * @param string  $stdout  Captured standard output.
	 * @param string  $stderr  Captured standard error, included in the message.
	 * @throws UnsuccessfulCommandException When no output was produced.
	 */
	public static function checkCommandExecution($command, $stdout, $stderr)
	{
		if ($command->useFileAsOutput) {
			$file = $command->getOutputFile();
			if (file_exists($file) && filesize($file) > 0) return;
		}

		if (!$command->useFileAsOutput && $stdout) {
			return;
		}

		$msg = array();
		$msg[] = 'Error! The command did not produce any output.';
		$msg[] = '';
		$msg[] = 'Generated command:';
		$msg[] = "$command";
		$msg[] = '';
		$msg[] = 'Returned message:';
		$arrayStderr = explode(PHP_EOL, $stderr);
		// Drop only the empty element left by a trailing newline; when stderr
		// does not end with a newline the last line is real content.
		if (end($arrayStderr) === '') array_pop($arrayStderr);
		$msg = array_merge($msg, $arrayStderr);
		$msg = join(PHP_EOL, $msg);

		throw new UnsuccessfulCommandException($msg);
	}

	/**
	 * Ensures the process handle is not FALSE.
	 *
	 * @param mixed  $processHandle Value returned when the process was opened.
	 * @param string $command       Command that was launched.
	 * @throws UnsuccessfulCommandException When the handle is FALSE.
	 */
	public static function checkProcessCreation($processHandle, $command)
	{
		if ($processHandle !== FALSE) return;

		$msg = array();
		$msg[] = 'Error! The command could not be launched.';
		$msg[] = '';
		$msg[] = 'Generated command:';
		$msg[] = "$command";
		$msg = join(PHP_EOL, $msg);

		throw new UnsuccessfulCommandException($msg);
	}

	/**
	 * Ensures the installed tesseract version is at least $expected.
	 *
	 * @param string  $expected Minimum required version.
	 * @param string  $action   Feature name used in the error message.
	 * @param Command $command  Command used to query the version.
	 * @throws FeatureNotAvailableException When the version is too old.
	 */
	public static function checkTesseractVersion($expected, $action, $command)
	{
		// Cast first: the version may be null when it could not be read,
		// and offset access on a non-string would raise a warning.
		$actual = (string) $command->getTesseractVersion();

		if (strncmp($actual, 'v', 1) === 0)
			$actual = substr($actual, 1);

		if (version_compare($actual, $expected, ">=")) return;

		$msg = array();
		$msg[] = "Error! $action is not available this tesseract version";
		$msg[] = "Required version is $expected, actual version is $actual";
		$msg[] = '';
		$msg[] = 'Generated command:';
		$msg[] = "$command";
		$msg = join(PHP_EOL, $msg);

		throw new FeatureNotAvailableException($msg);
	}

	/**
	 * Ensures $path can be written: creates its parent directory when
	 * missing, then checks that the directory (and the file, if it already
	 * exists) is writable.
	 *
	 * @param string $path Output file path.
	 * @throws NoWritePermissionsForOutputFile When writing is not possible.
	 */
	public static function checkWritePermissions($path)
	{
		$directory = dirname($path);
		// Recursive so that several missing parent levels are created too.
		if (!is_dir($directory)) mkdir($directory, 0777, true);
		$writableDirectory = is_writable($directory);
		$writableFile = true;
		if (file_exists($path)) $writableFile = is_writable($path);
		if ($writableFile && $writableDirectory) return;

		$msg = array();
		$msg[] = "Error! No permission to write to $path";
		$msg[] = "Make sure you have the right outputFile and permissions "
			."to write to the folder";
		$msg[] = '';
		$msg = join(PHP_EOL, $msg);

		throw new NoWritePermissionsForOutputFile($msg);
	}
}
|
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/ImageNotFoundException.php
vendored
Normal file
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/ImageNotFoundException.php
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
<?php
|
||||
|
||||
namespace thiagoalessio\TesseractOCR;
|
||||
|
||||
/**
 * Thrown by FriendlyErrors::checkImagePath() when the given image path
 * does not exist.
 */
class ImageNotFoundException extends TesseractOcrException
|
||||
{
|
||||
}
|
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/NoWritePermissionsForOutputFile.php
vendored
Normal file
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/NoWritePermissionsForOutputFile.php
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
<?php
|
||||
|
||||
namespace thiagoalessio\TesseractOCR;
|
||||
|
||||
/**
 * Thrown by FriendlyErrors::checkWritePermissions() when the output file
 * or its directory is not writable.
 */
class NoWritePermissionsForOutputFile extends TesseractOcrException
|
||||
{
|
||||
}
|
79
tesseract/vendor/thiagoalessio/tesseract_ocr/src/Option.php
vendored
Normal file
79
tesseract/vendor/thiagoalessio/tesseract_ocr/src/Option.php
vendored
Normal file
|
@ -0,0 +1,79 @@
|
|||
<?php namespace thiagoalessio\TesseractOCR;
|
||||
|
||||
/**
 * Factories for command-line option fragments.
 *
 * Every factory returns a closure producing the option text; closures that
 * need it take the tesseract version string as their argument.
 */
class Option
{
	/**
	 * Page segmentation option: "--psm" from version 4 on, "-psm" before.
	 */
	public static function psm($psm)
	{
		return function($version) use ($psm) {
			$normalized = preg_replace('/^v/', '', $version);
			$flag = version_compare($normalized, 4, '>=') ? '--psm' : '-psm';
			return "$flag $psm";
		};
	}

	/**
	 * OCR engine mode option; requires tesseract 3.05 or later.
	 */
	public static function oem($oem)
	{
		return function($version) use ($oem) {
			Option::checkMinVersion('3.05', $version, 'oem');
			return '--oem '.$oem;
		};
	}

	/**
	 * Image DPI option.
	 */
	public static function dpi($dpi)
	{
		return function() use ($dpi) {
			return '--dpi '.$dpi;
		};
	}

	/**
	 * User words file option; requires tesseract 3.04 or later.
	 */
	public static function userWords($path)
	{
		return function($version) use ($path) {
			Option::checkMinVersion('3.04', $version, 'user-words');
			$quoted = addcslashes($path, '\\"');
			return "--user-words \"$quoted\"";
		};
	}

	/**
	 * User patterns file option; requires tesseract 3.04 or later.
	 */
	public static function userPatterns($path)
	{
		return function($version) use ($path) {
			Option::checkMinVersion('3.04', $version, 'user-patterns');
			$quoted = addcslashes($path, '\\"');
			return "--user-patterns \"$quoted\"";
		};
	}

	/**
	 * Tessdata directory option.
	 */
	public static function tessdataDir($path)
	{
		return function() use ($path) {
			$quoted = addcslashes($path, '\\"');
			return "--tessdata-dir \"$quoted\"";
		};
	}

	/**
	 * Language option; all arguments are joined with "+".
	 */
	public static function lang()
	{
		$languages = func_get_args();
		return function() use ($languages) {
			return '-l '.implode('+', $languages);
		};
	}

	/**
	 * Generic "-c name=value" option; the name is converted from camelCase
	 * to lower-case with underscores before upper-case letters.
	 */
	public static function config($var, $value)
	{
		return function() use($var, $value) {
			$name = strtolower(preg_replace('/([A-Z])+/', '_$1', $var));
			$quoted = addcslashes($name.'='.$value, '\\"');
			return "-c \"$quoted\"";
		};
	}

	/**
	 * Throws when $currVersion is lower than $minVersion; a leading "v" is
	 * ignored on both.
	 *
	 * @throws \Exception
	 */
	public static function checkMinVersion($minVersion, $currVersion, $option)
	{
		$minVersion = preg_replace('/^v/', '', $minVersion);
		$currVersion = preg_replace('/^v/', '', $currVersion);
		if (version_compare($currVersion, $minVersion, '>=')) return;

		$lines = array(
			"$option option is only available on Tesseract $minVersion or later.",
			"Your version of Tesseract is $currVersion",
		);
		throw new \Exception(implode(PHP_EOL, $lines));
	}
}
|
83
tesseract/vendor/thiagoalessio/tesseract_ocr/src/Process.php
vendored
Normal file
83
tesseract/vendor/thiagoalessio/tesseract_ocr/src/Process.php
vendored
Normal file
|
@ -0,0 +1,83 @@
|
|||
<?php namespace thiagoalessio\TesseractOCR;
|
||||
|
||||
/**
 * Runs a command through proc_open() with pipes attached to its stdin,
 * stdout and stderr, and collects its output.
 */
class Process {

	private $stdin;
	private $stdout;
	private $stderr;
	private $handle;
	private $startTime;

	/**
	 * Launches the command and switches the output pipes to non-blocking.
	 *
	 * @param string $command Command line to execute.
	 * @throws UnsuccessfulCommandException Raised via
	 *         FriendlyErrors::checkProcessCreation() when the launch fails.
	 */
	public function __construct($command)
	{
		$this->startTime = microtime(true);
		$streamDescriptors = [
			array("pipe", "r"),
			array("pipe", "w"),
			array("pipe", "w")
		];
		$this->handle = proc_open($command, $streamDescriptors, $pipes, NULL, NULL, ["bypass_shell" => true]);

		// Validate the handle before touching the pipes.
		FriendlyErrors::checkProcessCreation($this->handle, $command);

		list($this->stdin, $this->stdout, $this->stderr) = $pipes;

		// This can avoid a deadlock in some cases (when the stderr buffer
		// fills up before anything is written to stdout, and vice versa).
		stream_set_blocking($this->stdout, 0);
		stream_set_blocking($this->stderr, 0);
	}

	/**
	 * Writes $data to the process' stdin, retrying after partial writes.
	 *
	 * @param string $data Bytes to send.
	 * @param int    $len  Expected number of bytes to be written.
	 * @return bool True when exactly $len bytes were written.
	 */
	public function write($data, $len)
	{
		$total = 0;
		while ($total < $len)
		{
			$res = fwrite($this->stdin, substr($data, $total));
			// false (error) or 0 (nothing accepted): stop retrying.
			if (!$res) break;
			// Accumulate the byte count. The previous form
			// "$total += $res < $len" added the boolean result of the
			// comparison because "<" binds tighter than "+=".
			$total += $res;
		}
		return $total === $len;
	}


	/**
	 * Polls the process until it exits or the timeout elapses, collecting
	 * whatever it writes to stdout and stderr.
	 *
	 * @param int|float $timeout Seconds, measured from construction;
	 *                           0 disables the timeout.
	 * @return array ["out" => string, "err" => string]
	 */
	public function wait($timeout = 0)
	{
		$running = true;
		$data = ["out" => "", "err" => ""];
		while (($running === true) && !$this->hasTimedOut($timeout))
		{
			$data["out"] .= fread($this->stdout, 8192);
			$data["err"] .= fread($this->stderr, 8192);
			$procInfo = proc_get_status($this->handle);
			$running = $procInfo["running"];
			if ($running) {
				usleep(1000); // Sleep 1ms to yield CPU time
			}
		}
		if ($running !== true) {
			// The process has exited: collect what is still buffered in the
			// pipes (more than one chunk, or data written between the last
			// read and the status check).
			$data["out"] .= $this->drain($this->stdout);
			$data["err"] .= $this->drain($this->stderr);
		}
		return $data;
	}

	/**
	 * Closes all pipes and the process handle.
	 *
	 * @return int The value returned by proc_close().
	 */
	public function close()
	{
		$this->closeStream($this->stdin);
		$this->closeStream($this->stdout);
		$this->closeStream($this->stderr);
		return proc_close($this->handle);
	}

	/**
	 * Closes only the stdin pipe.
	 */
	public function closeStdin()
	{
		$this->closeStream($this->stdin);
	}

	/**
	 * @return bool True when a positive timeout has elapsed since construction.
	 */
	private function hasTimedOut($timeout)
	{
		return (($timeout > 0) && ($this->startTime + $timeout < microtime(true)));
	}

	/**
	 * Reads everything currently available from a stream, stopping at end
	 * of file or at the first empty/failed read.
	 */
	private function drain($stream)
	{
		$buffer = "";
		while (!feof($stream))
		{
			$chunk = fread($stream, 8192);
			if ($chunk === false || $chunk === "") break;
			$buffer .= $chunk;
		}
		return $buffer;
	}

	/**
	 * Closes a stream once and nulls the reference so that repeated calls
	 * are harmless.
	 */
	private function closeStream(&$stream)
	{
		if ($stream !== NULL)
		{
			fclose($stream);
			$stream = NULL;
		}
	}
}
|
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractNotFoundException.php
vendored
Normal file
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractNotFoundException.php
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
<?php
|
||||
|
||||
namespace thiagoalessio\TesseractOCR;
|
||||
|
||||
/**
 * Concrete exception in the TesseractOcrException hierarchy.
 *
 * NOTE(review): judging by the name, this is raised when the tesseract
 * executable cannot be located; the throwing code is outside this file.
 */
class TesseractNotFoundException extends TesseractOcrException {}
|
181
tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractOCR.php
vendored
Normal file
181
tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractOCR.php
vendored
Normal file
|
@ -0,0 +1,181 @@
|
|||
<?php namespace thiagoalessio\TesseractOCR;
|
||||
|
||||
use thiagoalessio\TesseractOCR\Command;
|
||||
use thiagoalessio\TesseractOCR\Option;
|
||||
use thiagoalessio\TesseractOCR\FriendlyErrors;
|
||||
|
||||
/**
 * Fluent front-end that configures a Command and runs it through a Process
 * to obtain the recognized text.
 */
class TesseractOCR
{
    /** Command object holding the executable, image, options and I/O flags. */
    public $command;

    /** Path given to setOutputFile(), or null when no output file was requested. */
    private $outputFile = null;

    /**
     * @param mixed        $image   Image reference; cast to string and stored on the command.
     * @param Command|null $command Command to configure; a new Command is created when falsy.
     */
    public function __construct($image=null, $command=null)
    {
        $this->command = $command ? $command : new Command();
        $this->image((string) $image);
    }

    /**
     * Executes the configured command and returns the trimmed text.
     *
     * The text is read from the command's output file when
     * `useFileAsOutput` is set, otherwise from the process' stdout.
     *
     * @param int|float $timeout Forwarded to Process::wait().
     *
     * @return string
     *
     * @throws TesseractOcrException Rethrown after temp files are cleaned up.
     */
    public function run($timeout = 0)
    {
        $strip = " \t\n\r\0\x0A\x0B\x0C";

        try {
            if ($this->outputFile !== null) {
                FriendlyErrors::checkWritePermissions($this->outputFile);
                $this->command->useFileAsOutput = true;
            }

            FriendlyErrors::checkTesseractPresence($this->command->executable);
            if ($this->command->useFileAsInput) {
                FriendlyErrors::checkImagePath($this->command->image);
            }

            $worker = new Process((string) $this->command);

            // Image data supplied in memory is piped through stdin.
            if (!$this->command->useFileAsInput) {
                $worker->write($this->command->image, $this->command->imageSize);
                $worker->closeStdin();
            }
            $streams = $worker->wait($timeout);

            FriendlyErrors::checkCommandExecution($this->command, $streams['out'], $streams['err']);
        } catch (TesseractOcrException $failure) {
            if ($this->command->useFileAsOutput) {
                $this->cleanTempFiles();
            }
            throw $failure;
        }

        if (!$this->command->useFileAsOutput) {
            return trim($streams['out'], $strip);
        }

        $recognized = file_get_contents($this->command->getOutputFile());

        // Keep a copy at the user-requested location before removing temp files.
        if ($this->outputFile !== null) {
            rename($this->command->getOutputFile(), $this->outputFile);
        }
        $this->cleanTempFiles();

        return trim($recognized, $strip);
    }

    /**
     * Supplies raw image bytes to be sent through stdin instead of a file path.
     *
     * @param string $image Image contents.
     * @param int    $size  Size of the image data.
     *
     * @return $this
     */
    public function imageData($image, $size)
    {
        FriendlyErrors::checkTesseractVersion("3.03-rc1", "Reading image data from stdin", $this->command);

        $target = $this->command;
        $target->useFileAsInput = false;
        $target->image = $image;
        $target->imageSize = $size;

        return $this;
    }

    /**
     * Makes the command write its result to stdout rather than a temp file.
     *
     * @return $this
     */
    public function withoutTempFiles()
    {
        FriendlyErrors::checkTesseractVersion("3.03-rc1", "Writing to stdout (without using temp files)", $this->command);

        $this->command->useFileAsOutput = false;

        return $this;
    }

    /**
     * Sets the image on the command.
     *
     * @return $this
     */
    public function image($image)
    {
        $this->command->image = $image;

        return $this;
    }

    /**
     * Sets the executable on the command after checking that it is present.
     *
     * @return $this
     */
    public function executable($executable)
    {
        FriendlyErrors::checkTesseractPresence($executable);

        $this->command->executable = $executable;

        return $this;
    }

    /**
     * Sets the config file on the command.
     *
     * @return $this
     */
    public function configFile($configFile)
    {
        $this->command->configFile = $configFile;

        return $this;
    }

    /**
     * Sets the temp directory on the command.
     *
     * @return $this
     */
    public function tempDir($tempDir)
    {
        $this->command->tempDir = $tempDir;

        return $this;
    }

    /**
     * Sets the thread limit on the command.
     *
     * @return $this
     */
    public function threadLimit($limit)
    {
        $this->command->threadLimit = $limit;

        return $this;
    }

    /**
     * Alias of configFile().
     *
     * @deprecated
     */
    public function format($fmt)
    {
        return $this->configFile($fmt);
    }

    /**
     * Remembers where run() should move the resulting output file.
     *
     * @return $this
     */
    public function setOutputFile($path)
    {
        $this->outputFile = $path;

        return $this;
    }

    /**
     * Adds a `tessedit_char_whitelist` config option built by concatenating
     * every argument; array arguments are joined first.
     *
     * @return $this
     */
    public function allowlist()
    {
        $pieces = array();
        foreach (func_get_args() as $arg) {
            $pieces[] = is_array($arg) ? join('', $arg) : $arg;
        }

        $this->command->options[] = Option::config('tessedit_char_whitelist', join('', $pieces));

        return $this;
    }

    /**
     * Deprecated alias of allowlist(); emits an E_USER_NOTICE.
     *
     * @return $this
     */
    public function whitelist()
    {
        trigger_error('Notice: whitelist is deprecated, use allowlist instead.', E_USER_NOTICE);

        $pieces = array();
        foreach (func_get_args() as $arg) {
            $pieces[] = is_array($arg) ? join('', $arg) : $arg;
        }

        return $this->allowlist(join('', $pieces));
    }

    /**
     * Returns the command's getTesseractVersion() result.
     */
    public function version()
    {
        return $this->command->getTesseractVersion();
    }

    /**
     * Returns the command's getAvailableLanguages() result.
     */
    public function availableLanguages()
    {
        return $this->command->getAvailableLanguages();
    }

    /**
     * Handles undefined methods: known config-file names select that config
     * file, names matching an Option method add that option, anything else
     * becomes a generic config option using the first argument (or null).
     *
     * @return $this
     */
    public function __call($method, $args)
    {
        if ($this->isConfigFile($method)) {
            return $this->configFile($method);
        }

        if ($this->isOption($method)) {
            $factory = sprintf('%s::%s', $this->getOptionClassName(), $method);
            $this->command->options[] = call_user_func_array($factory, $args);

            return $this;
        }

        $value = null;
        if (!empty($args)) {
            $value = $args[0];
        }
        $this->command->options[] = Option::config($method, $value);

        return $this;
    }

    /**
     * Tells whether $name is one of the config-file shortcuts.
     */
    private function isConfigFile($name)
    {
        $shortcuts = array('digits', 'hocr', 'pdf', 'quiet', 'tsv', 'txt');

        return in_array($name, $shortcuts);
    }

    /**
     * Tells whether the Option class has a method called $name.
     */
    private function isOption($name)
    {
        $available = get_class_methods($this->getOptionClassName());

        return in_array($name, $available);
    }

    /**
     * Returns the Option class name qualified with this file's namespace.
     */
    private function getOptionClassName()
    {
        return __NAMESPACE__.'\\Option';
    }

    /**
     * Removes both variants of the command's output file
     * (getOutputFile(false) and getOutputFile(true)) when they exist.
     */
    private function cleanTempFiles()
    {
        foreach (array(false, true) as $flag) {
            if (file_exists($this->command->getOutputFile($flag))) {
                unlink($this->command->getOutputFile($flag));
            }
        }
    }
}
|
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractOcrException.php
vendored
Normal file
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractOcrException.php
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
<?php
|
||||
|
||||
namespace thiagoalessio\TesseractOCR;
|
||||
|
||||
/**
 * Abstract base type for this library's exceptions.
 *
 * TesseractOCR::run() catches this type to clean up temporary files
 * before rethrowing.
 */
abstract class TesseractOcrException extends \Exception {}
|
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/UnsuccessfulCommandException.php
vendored
Normal file
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/UnsuccessfulCommandException.php
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
<?php
|
||||
|
||||
namespace thiagoalessio\TesseractOCR;
|
||||
|
||||
/**
 * Concrete exception in the TesseractOcrException hierarchy.
 *
 * NOTE(review): judging by the name, this signals that the tesseract
 * command ran but did not succeed; the throwing code is outside this file.
 */
class UnsuccessfulCommandException extends TesseractOcrException {}
|
Loading…
Add table
Add a link
Reference in a new issue