2019-04-20 18:38:32 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
/*
 * This file is part of Crawler Detect - the web crawler detection library.
 *
 * (c) Mark Beech <m@rkbee.ch>
 *
 * This source file is subject to the MIT license that is bundled
 * with this source code in the file LICENSE.
 */
|
|
|
|
|
|
|
|
namespace Jaybizzle\CrawlerDetect;
|
|
|
|
|
|
|
|
use Jaybizzle\CrawlerDetect\Fixtures\Crawlers;
|
|
|
|
use Jaybizzle\CrawlerDetect\Fixtures\Exclusions;
|
|
|
|
use Jaybizzle\CrawlerDetect\Fixtures\Headers;
|
|
|
|
|
|
|
|
class CrawlerDetect
{
    /**
     * The user agent.
     *
     * Either the value handed to setUserAgent(), or the concatenation of the
     * user-agent-bearing HTTP headers found in $httpHeaders.
     *
     * @var string|null
     */
    protected $userAgent;

    /**
     * Headers that contain a user agent.
     *
     * Only entries whose key starts with "HTTP_" are kept (see setHttpHeaders()).
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * Store regex matches.
     *
     * Filled by the last isCrawler() call; empty when that call found nothing.
     *
     * @var array
     */
    protected $matches = array();

    /**
     * Crawlers object.
     *
     * @var \Jaybizzle\CrawlerDetect\Fixtures\Crawlers
     */
    protected $crawlers;

    /**
     * Exclusions object.
     *
     * @var \Jaybizzle\CrawlerDetect\Fixtures\Exclusions
     */
    protected $exclusions;

    /**
     * Headers object.
     *
     * @var \Jaybizzle\CrawlerDetect\Fixtures\Headers
     */
    protected $uaHttpHeaders;

    /**
     * The compiled regex string.
     *
     * @var string
     */
    protected $compiledRegex;

    /**
     * The compiled exclusions regex string.
     *
     * @var string
     */
    protected $compiledExclusions;

    /**
     * Class constructor.
     *
     * Loads the fixture lists, compiles both regexes once, then records the
     * request headers and the user agent.
     *
     * NOTE(review): the implicit-nullable `array $headers = null` is deprecated
     * as of PHP 8.4; switch to `?array` once the minimum supported PHP version
     * is known to be >= 7.1.
     *
     * @param array|null  $headers   HTTP headers; $_SERVER is used when null or empty
     * @param string|null $userAgent Explicit user agent; built from $headers when null
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        $this->crawlers = new Crawlers();
        $this->exclusions = new Exclusions();
        $this->uaHttpHeaders = new Headers();

        $this->compiledRegex = $this->compileRegex($this->crawlers->getAll());
        $this->compiledExclusions = $this->compileRegex($this->exclusions->getAll());

        // Headers must be set first: setUserAgent() reads them when no
        // explicit user agent is supplied.
        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }

    /**
     * Compile the regex patterns into one regex string.
     *
     * The patterns are joined with "|" and wrapped in a single capture group.
     * No delimiters or escaping are added here.
     *
     * @param array $patterns
     *
     * @return string
     */
    public function compileRegex($patterns)
    {
        return '('.implode('|', $patterns).')';
    }

    /**
     * Set HTTP headers.
     *
     * Replaces any previously stored headers.
     *
     * @param array|null $httpHeaders
     */
    public function setHttpHeaders($httpHeaders)
    {
        // Use global _SERVER if $httpHeaders aren't defined.
        if (! is_array($httpHeaders) || ! count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }

        // Clear existing headers.
        $this->httpHeaders = array();

        // Only save HTTP headers. In PHP land, that means
        // only _SERVER vars that start with HTTP_.
        foreach ($httpHeaders as $key => $value) {
            if (strpos($key, 'HTTP_') === 0) {
                $this->httpHeaders[$key] = $value;
            }
        }
    }

    /**
     * Return user agent headers.
     *
     * These are the header names setUserAgent() looks up in the stored
     * HTTP headers.
     *
     * @return array
     */
    public function getUaHttpHeaders()
    {
        return $this->uaHttpHeaders->getAll();
    }

    /**
     * Set the user agent.
     *
     * When null is given, the user agent is assembled from every stored HTTP
     * header named by getUaHttpHeaders(), each value followed by one space.
     * It stays null when none of those headers is present.
     *
     * @param string|null $userAgent
     *
     * @return string|null The user agent that was stored
     */
    public function setUserAgent($userAgent)
    {
        if (is_null($userAgent)) {
            foreach ($this->getUaHttpHeaders() as $altHeader) {
                if (isset($this->httpHeaders[$altHeader])) {
                    $userAgent .= $this->httpHeaders[$altHeader].' ';
                }
            }
        }

        return $this->userAgent = $userAgent;
    }

    /**
     * Check user agent string against the regex.
     *
     * Text matching the exclusions regex is removed first; whatever remains is
     * tested against the crawlers regex. Both regexes are case-insensitive.
     *
     * @param string|null $userAgent Falls back to the stored user agent when empty
     *
     * @return bool
     */
    public function isCrawler($userAgent = null)
    {
        // Forget the previous call's result up front, so getMatches() cannot
        // report a stale match when we return early below.
        $this->matches = array();

        // preg_replace() yields null on a PCRE error; the cast keeps trim()
        // working on a string and treats that case as an empty agent.
        $agent = trim((string) preg_replace(
            "/{$this->compiledExclusions}/i",
            '',
            $userAgent ?: $this->userAgent ?: ''
        ));

        // Nothing left after stripping the exclusions: not a crawler.
        if ($agent === '') {
            return false;
        }

        return (bool) preg_match("/{$this->compiledRegex}/i", $agent, $this->matches);
    }

    /**
     * Return the matches.
     *
     * @return string|null The text matched by the last isCrawler() call, or
     *                     null when that call did not match
     */
    public function getMatches()
    {
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }

    /**
     * Return the user agent currently stored on this instance.
     *
     * @return string|null
     */
    public function getUserAgent()
    {
        return $this->userAgent;
    }
}
|