Rename botdetection to blockbot

Adding composer/vendor to blockbot
This commit is contained in:
Philipp Holzer 2019-04-20 20:38:32 +02:00
parent f1839f23e6
commit 34fc60be77
No known key found for this signature in database
GPG key ID: 517BE60E2CE5C8A5
27 changed files with 3768 additions and 7 deletions

View file

@ -0,0 +1,193 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect;
use Jaybizzle\CrawlerDetect\Fixtures\Crawlers;
use Jaybizzle\CrawlerDetect\Fixtures\Exclusions;
use Jaybizzle\CrawlerDetect\Fixtures\Headers;
class CrawlerDetect
{
    /**
     * The user agent string to test, or null when none was given and none
     * could be assembled from the HTTP headers.
     *
     * @var string|null
     */
    protected $userAgent = null;

    /**
     * Request headers whose names start with "HTTP_", keyed by header name.
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * Regex matches captured by the most recent isCrawler() call.
     *
     * @var array
     */
    protected $matches = array();

    /**
     * Crawlers object.
     *
     * @var \Jaybizzle\CrawlerDetect\Fixtures\Crawlers
     */
    protected $crawlers;

    /**
     * Exclusions object.
     *
     * @var \Jaybizzle\CrawlerDetect\Fixtures\Exclusions
     */
    protected $exclusions;

    /**
     * Headers object listing the header names that may carry a user agent.
     *
     * @var \Jaybizzle\CrawlerDetect\Fixtures\Headers
     */
    protected $uaHttpHeaders;

    /**
     * The compiled crawler regex string (without delimiters).
     *
     * @var string
     */
    protected $compiledRegex;

    /**
     * The compiled exclusions regex string (without delimiters).
     *
     * @var string
     */
    protected $compiledExclusions;

    /**
     * Class constructor.
     *
     * Loads the fixture lists, compiles both regexes once, then stores the
     * headers and the user agent.
     *
     * NOTE(review): the implicitly nullable `array $headers = null` is kept for
     * compatibility with old PHP versions; PHP 8.4 deprecates this form, so it
     * should become `?array` once PHP < 7.1 no longer needs to be supported.
     *
     * @param array|null  $headers   Headers to inspect; $_SERVER is used when null or empty.
     * @param string|null $userAgent Explicit user agent; built from the headers when null.
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        $this->crawlers = new Crawlers();
        $this->exclusions = new Exclusions();
        $this->uaHttpHeaders = new Headers();

        $this->compiledRegex = $this->compileRegex($this->crawlers->getAll());
        $this->compiledExclusions = $this->compileRegex($this->exclusions->getAll());

        // Headers must be set first: setUserAgent() reads them when no
        // explicit user agent is supplied.
        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }

    /**
     * Compile the regex patterns into one regex string.
     *
     * The patterns are joined verbatim with '|' inside a single group, so each
     * one must already be valid inside the '/'-delimited expressions built in
     * isCrawler().
     *
     * @param array $patterns Regex fragments to combine.
     *
     * @return string
     */
    public function compileRegex($patterns)
    {
        return '('.implode('|', $patterns).')';
    }

    /**
     * Set HTTP headers.
     *
     * Replaces any previously stored headers.
     *
     * @param array|null $httpHeaders
     */
    public function setHttpHeaders($httpHeaders)
    {
        // Use global _SERVER if $httpHeaders aren't defined.
        if (! is_array($httpHeaders) || ! count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }

        // Clear existing headers.
        $this->httpHeaders = array();

        // Only save HTTP headers. In PHP land, that means
        // only _SERVER vars that start with HTTP_.
        foreach ($httpHeaders as $key => $value) {
            if (strpos($key, 'HTTP_') === 0) {
                $this->httpHeaders[$key] = $value;
            }
        }
    }

    /**
     * Return user agent headers.
     *
     * @return array
     */
    public function getUaHttpHeaders()
    {
        return $this->uaHttpHeaders->getAll();
    }

    /**
     * Set the user agent.
     *
     * When null is given, the user agent is assembled by concatenating the
     * values of every stored header named in getUaHttpHeaders(), each followed
     * by a space. If none of those headers is present it stays null.
     *
     * @param string|null $userAgent
     *
     * @return string|null The user agent that was stored.
     */
    public function setUserAgent($userAgent)
    {
        if (is_null($userAgent)) {
            foreach ($this->getUaHttpHeaders() as $altHeader) {
                if (isset($this->httpHeaders[$altHeader])) {
                    $userAgent .= $this->httpHeaders[$altHeader].' ';
                }
            }
        }

        return $this->userAgent = $userAgent;
    }

    /**
     * Check user agent string against the regex.
     *
     * The exclusion patterns are stripped first; what is left is tested
     * against the crawler patterns. Both steps are case-insensitive.
     *
     * @param string|null $userAgent User agent to test; the stored one is used when empty.
     *
     * @return bool
     */
    public function isCrawler($userAgent = null)
    {
        // Forget the previous result so getMatches() never reports a crawler
        // name that belongs to an earlier call.
        $this->matches = array();

        // Cast to string on the way in and out: the stored user agent may be
        // null, and preg_replace() returns null on a PCRE error. Passing null
        // to preg_replace()/trim() is deprecated as of PHP 8.1.
        $agent = trim((string) preg_replace(
            "/{$this->compiledExclusions}/i",
            '',
            (string) ($userAgent ?: $this->userAgent)
        ));

        if ($agent === '') {
            return false;
        }

        $result = preg_match("/{$this->compiledRegex}/i", $agent, $matches);

        if ($matches) {
            $this->matches = $matches;
        }

        return (bool) $result;
    }

    /**
     * Return the matches.
     *
     * @return string|null The text matched by the last isCrawler() call, or null if it did not match.
     */
    public function getMatches()
    {
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }
}

View file

@ -0,0 +1,32 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect\Fixtures;
/**
 * Base class for the fixture providers: holds a list of values and exposes it
 * through getAll(). Subclasses supply the list by redeclaring $data.
 */
abstract class AbstractProvider
{
    /**
     * The data set.
     *
     * Defaults to an empty array so that getAll() honours its array return
     * type even when a subclass does not define any data.
     *
     * @var array
     */
    protected $data = array();

    /**
     * Return the data set.
     *
     * @return array
     */
    public function getAll()
    {
        return $this->data;
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,72 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect\Fixtures;
/**
* Provides the regex fragments that are stripped from a user agent string
* before it is tested against the crawler patterns.
*/
class Exclusions extends AbstractProvider
{
/**
* List of strings to remove from the user agent before running the crawler regex
* Over a large list of user agents, this gives us about a 55% speed increase!
*
* Each entry is a regex fragment, not a plain string. CrawlerDetect::compileRegex()
* joins the entries with '|' in the order listed here, and the result is used
* case-insensitively between '/' delimiters, which is why literal slashes are
* written as '\/'. Because alternatives are tried in order, a longer entry has to
* come before a shorter entry that it starts with (e.g. 'Opera Mini...' before 'Opera').
*
* @var array
*/
protected $data = array(
'Safari.[\d\.]*',
'Firefox.[\d\.]*',
' Chrome.[\d\.]*',
'Chromium.[\d\.]*',
// NOTE(review): unlike the neighbouring entries this one has no trailing '*',
// so only a single version character is removed - confirm this is intended.
'MSIE.[\d\.]',
'Opera\/[\d\.]*',
'Mozilla.[\d\.]*',
'AppleWebKit.[\d\.]*',
'Trident.[\d\.]*',
'Windows NT.[\d\.]*',
'Android [\d\.]*',
'Macintosh.',
'Ubuntu',
'Linux',
'[ ]Intel',
'Mac OS X [\d_]*',
'(like )?Gecko(.[\d\.]*)?',
'KHTML,',
'CriOS.[\d\.]*',
'CPU iPhone OS ([0-9_])* like Mac OS X',
'CPU OS ([0-9_])* like Mac OS X',
'iPod',
'compatible',
'x86_..',
'i686',
'x64',
'X11',
'rv:[\d\.]*',
'Version.[\d\.]*',
'WOW64',
'Win64',
'Dalvik.[\d\.]*',
' \.NET CLR [\d\.]*',
'Presto.[\d\.]*',
'Media Center PC',
'BlackBerry',
'Build',
'Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.',
'Opera',
' \.NET[\d\.]*',
// The entries below contain "bot" or start with ';'. Presumably they are
// device names that would otherwise be mistaken for crawlers - TODO confirm.
'cubot',
'; M bot',
'; CRONO',
'; B bot',
'; IDbot',
'; ID bot',
'; POWER BOT',
// Strip any remaining semicolons. Listed last so the '; ...' entries above are tried first.
';',
);
}

View file

@ -0,0 +1,37 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect\Fixtures;
/**
* Provides the names of the request headers that may carry a user agent string.
*/
class Headers extends AbstractProvider
{
/**
* All possible HTTP headers that represent the user agent string.
*
* The names use the "HTTP_"-prefixed form of $_SERVER keys. When no explicit
* user agent is given, CrawlerDetect::setUserAgent() concatenates the values of
* every header from this list that is present, in the order listed here.
*
* @var array
*/
protected $data = array(
// The default User-Agent string.
'HTTP_USER_AGENT',
// Header can occur on devices using Opera Mini.
'HTTP_X_OPERAMINI_PHONE_UA',
// Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
'HTTP_X_DEVICE_USER_AGENT',
'HTTP_X_ORIGINAL_USER_AGENT',
'HTTP_X_SKYFIRE_PHONE',
'HTTP_X_BOLT_PHONE_UA',
'HTTP_DEVICE_STOCK_UA',
'HTTP_X_UCBROWSER_DEVICE_UA',
// Sometimes, bots (especially Google) use a genuine user agent, but fill this header in with their email address
'HTTP_FROM',
'HTTP_X_SCANNER', // Seen in use by Netsparker
);
}