crawler.tar
src/CrawlObservers/CrawlObserver.php 0000644 00000002534 15107470151 0013574 0 ustar 00 <?php namespace Spatie\Crawler\CrawlObservers; use GuzzleHttp\Exception\RequestException; use Psr\Http\Message\ResponseInterface; use Psr\Http\Message\UriInterface; abstract class CrawlObserver { /** * Called when the crawler will crawl the url. * * @param \Psr\Http\Message\UriInterface $url */ public function willCrawl(UriInterface $url): void { } /** * Called when the crawler has crawled the given url successfully. * * @param \Psr\Http\Message\UriInterface $url * @param \Psr\Http\Message\ResponseInterface $response * @param \Psr\Http\Message\UriInterface|null $foundOnUrl */ abstract public function crawled( UriInterface $url, ResponseInterface $response, ?UriInterface $foundOnUrl = null ): void; /** * Called when the crawler had a problem crawling the given url. * * @param \Psr\Http\Message\UriInterface $url * @param \GuzzleHttp\Exception\RequestException $requestException * @param \Psr\Http\Message\UriInterface|null $foundOnUrl */ abstract public function crawlFailed( UriInterface $url, RequestException $requestException, ?UriInterface $foundOnUrl = null ): void; /** * Called when the crawl has ended. */ public function finishedCrawling(): void { } } src/CrawlObservers/CrawlObserverCollection.php 0000644 00000004132 15107470151 0015604 0 ustar 00 <?php namespace Spatie\Crawler\CrawlObservers; use ArrayAccess; use GuzzleHttp\Exception\RequestException; use Iterator; use Psr\Http\Message\ResponseInterface; use Spatie\Crawler\CrawlUrl; class CrawlObserverCollection implements ArrayAccess, Iterator { protected int $position; public function __construct(protected array $observers = []) { $this->position = 0; } public function addObserver(CrawlObserver $observer): void { $this->observers[] = $observer; } public function crawled(CrawlUrl $crawlUrl, ResponseInterface $response): void { foreach ($this->observers as $crawlObserver) { $crawlObserver->crawled( $crawlUrl->url, $response, $crawlUrl->foundOnUrl ); } } public function crawlFailed(CrawlUrl $crawlUrl, RequestException $exception): void { foreach ($this->observers as $crawlObserver) { $crawlObserver->crawlFailed( $crawlUrl->url, $exception, $crawlUrl->foundOnUrl ); } } public function current(): mixed { return $this->observers[$this->position]; } public function offsetGet(mixed $offset): mixed { return $this->observers[$offset] ?? null; } public function offsetSet(mixed $offset, mixed $value): void { if (is_null($offset)) { $this->observers[] = $value; } else { $this->observers[$offset] = $value; } } public function offsetExists(mixed $offset): bool { return isset($this->observers[$offset]); } public function offsetUnset(mixed $offset): void { unset($this->observers[$offset]); } public function next(): void { $this->position++; } public function key(): mixed { return $this->position; } public function valid(): bool { return isset($this->observers[$this->position]); } public function rewind(): void { $this->position = 0; } } src/Exceptions/InvalidCrawlRequestHandler.php 0000644 00000000562 15107470151 0015417 0 ustar 00 <?php namespace Spatie\Crawler\Exceptions; use RuntimeException; class InvalidCrawlRequestHandler extends RuntimeException { public static function doesNotExtendBaseClass(string $handlerClass, string $baseClass): static { return new static("`{$handlerClass} is not a valid handler class. 
A valid handler class should extend `{$baseClass}`."); } } src/Exceptions/InvalidUrl.php 0000644 00000001056 15107470152 0012242 0 ustar 00 <?php namespace Spatie\Crawler\Exceptions; use Exception; use Psr\Http\Message\UriInterface; use Spatie\Crawler\CrawlUrl; class InvalidUrl extends Exception { public static function unexpectedType(mixed $url): static { $crawlUrlClass = CrawlUrl::class; $uriInterfaceClass = UriInterface::class; $givenUrlClass = is_object($url) ? get_class($url) : gettype($url); return new static("You passed an invalid url of type `{$givenUrlClass}`. This should be either a {$crawlUrlClass} or `{$uriInterfaceClass}`"); } } src/Exceptions/UrlNotFoundByIndex.php 0000644 00000000172 15107470152 0013671 0 ustar 00 <?php namespace Spatie\Crawler\Exceptions; use RuntimeException; class UrlNotFoundByIndex extends RuntimeException { } src/ResponseWithCachedBody.php 0000644 00000001336 15107470152 0012411 0 ustar 00 <?php namespace Spatie\Crawler; use GuzzleHttp\Psr7\Response; use Psr\Http\Message\ResponseInterface; class ResponseWithCachedBody extends Response { protected ?string $cachedBody = null; public static function fromGuzzlePsr7Response(ResponseInterface $response): static { return new static( $response->getStatusCode(), $response->getHeaders(), $response->getBody(), $response->getProtocolVersion(), $response->getReasonPhrase() ); } public function setCachedBody(?string $body = null): void { $this->cachedBody = $body; } public function getCachedBody(): ?string { return $this->cachedBody; } } src/CrawlProfiles/CrawlProfile.php 0000644 00000000267 15107470152 0013220 0 ustar 00 <?php namespace Spatie\Crawler\CrawlProfiles; use Psr\Http\Message\UriInterface; abstract class CrawlProfile { abstract public function shouldCrawl(UriInterface $url): bool; } src/CrawlProfiles/CrawlAllUrls.php 0000644 00000000332 15107470152 0013167 0 ustar 00 <?php namespace Spatie\Crawler\CrawlProfiles; use Psr\Http\Message\UriInterface; class CrawlAllUrls extends CrawlProfile { public function shouldCrawl(UriInterface $url): bool { return true; } } src/CrawlProfiles/CrawlInternalUrls.php 0000644 00000001001 15107470153 0014226 0 ustar 00 <?php namespace Spatie\Crawler\CrawlProfiles; use GuzzleHttp\Psr7\Uri; use Psr\Http\Message\UriInterface; class CrawlInternalUrls extends CrawlProfile { protected mixed $baseUrl; public function __construct($baseUrl) { if (! $baseUrl instanceof UriInterface) { $baseUrl = new Uri($baseUrl); } $this->baseUrl = $baseUrl; } public function shouldCrawl(UriInterface $url): bool { return $this->baseUrl->getHost() === $url->getHost(); } } src/CrawlProfiles/CrawlSubdomains.php 0000644 00000001206 15107470153 0013717 0 ustar 00 <?php namespace Spatie\Crawler\CrawlProfiles; use GuzzleHttp\Psr7\Uri; use Psr\Http\Message\UriInterface; class CrawlSubdomains extends CrawlProfile { protected mixed $baseUrl; public function __construct($baseUrl) { if (! 
$baseUrl instanceof UriInterface) { $baseUrl = new Uri($baseUrl); } $this->baseUrl = $baseUrl; } public function shouldCrawl(UriInterface $url): bool { return $this->isSubdomainOfHost($url); } public function isSubdomainOfHost(UriInterface $url): bool { return str_ends_with($url->getHost(), $this->baseUrl->getHost()); } } src/LinkAdder.php 0000644 00000006351 15107470153 0007711 0 ustar 00 <?php namespace Spatie\Crawler; use GuzzleHttp\Psr7\Uri; use Illuminate\Support\Collection; use InvalidArgumentException; use Psr\Http\Message\UriInterface; use Symfony\Component\DomCrawler\Crawler as DomCrawler; use Symfony\Component\DomCrawler\Link; use Tree\Node\Node; class LinkAdder { protected Crawler $crawler; public function __construct(Crawler $crawler) { $this->crawler = $crawler; } public function addFromHtml(string $html, UriInterface $foundOnUrl): void { $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl); collect($allLinks) ->filter(fn (UriInterface $url) => $this->hasCrawlableScheme($url)) ->map(fn (UriInterface $url) => $this->normalizeUrl($url)) ->filter(function (UriInterface $url) use ($foundOnUrl) { if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) { return false; } return $this->shouldCrawl($node); }) ->filter(fn (UriInterface $url) => ! str_contains($url->getPath(), '/tel:')) ->each(function (UriInterface $url) use ($foundOnUrl) { $crawlUrl = CrawlUrl::create($url, $foundOnUrl); $this->crawler->addToCrawlQueue($crawlUrl); }); } protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl): ?Collection { $domCrawler = new DomCrawler($html, $foundOnUrl); return collect($domCrawler->filterXpath('//a | //link[@rel="next" or @rel="prev"]')->links()) ->reject(function (Link $link) { if ($this->isInvalidHrefNode($link)) { return true; } if ($this->crawler->mustRejectNofollowLinks() && $link->getNode()->getAttribute('rel') === 'nofollow') { return true; } return false; }) ->map(function (Link $link) { try { return new Uri($link->getUri()); } catch (InvalidArgumentException $exception) { return; } }) ->filter(); } protected function hasCrawlableScheme(UriInterface $uri): bool { return in_array($uri->getScheme(), ['http', 'https']); } protected function normalizeUrl(UriInterface $url): UriInterface { return $url->withFragment(''); } protected function shouldCrawl(Node $node): bool { if ($this->crawler->mustRespectRobots() && ! 
$this->crawler->getRobotsTxt()->allows($node->getValue(), $this->crawler->getUserAgent())) { return false; } $maximumDepth = $this->crawler->getMaximumDepth(); if (is_null($maximumDepth)) { return true; } return $node->getDepth() <= $maximumDepth; } protected function isInvalidHrefNode(Link $link): bool { if ($link->getNode()->nodeName !== 'a') { return false; } if ($link->getNode()->nextSibling !== null) { return false; } if ($link->getNode()->childNodes->length !== 0) { return false; } return true; } } src/Handlers/CrawlRequestFulfilled.php 0000644 00000010242 15107470153 0014056 0 ustar 00 <?php namespace Spatie\Crawler\Handlers; use Exception; use GuzzleHttp\Psr7\Uri; use GuzzleHttp\Psr7\Utils; use GuzzleHttp\RedirectMiddleware; use Psr\Http\Message\ResponseInterface; use Psr\Http\Message\StreamInterface; use Psr\Http\Message\UriInterface; use Spatie\Crawler\Crawler; use Spatie\Crawler\CrawlerRobots; use Spatie\Crawler\CrawlProfiles\CrawlSubdomains; use Spatie\Crawler\CrawlUrl; use Spatie\Crawler\LinkAdder; use Spatie\Crawler\ResponseWithCachedBody; class CrawlRequestFulfilled { protected LinkAdder $linkAdder; public function __construct(protected Crawler $crawler) { $this->linkAdder = new LinkAdder($this->crawler); } public function __invoke(ResponseInterface $response, $index) { $body = $this->getBody($response); $robots = new CrawlerRobots( $response->getHeaders(), $body, $this->crawler->mustRespectRobots() ); $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index); if ($this->crawler->mayExecuteJavaScript()) { $body = $this->getBodyAfterExecutingJavaScript($crawlUrl->url); $response = $response->withBody(Utils::streamFor($body)); } $responseWithCachedBody = ResponseWithCachedBody::fromGuzzlePsr7Response($response); $responseWithCachedBody->setCachedBody($body); if ($robots->mayIndex()) { $this->handleCrawled($responseWithCachedBody, $crawlUrl); } if (! $this->crawler->getCrawlProfile() instanceof CrawlSubdomains) { if ($crawlUrl->url->getHost() !== $this->crawler->getBaseUrl()->getHost()) { return; } } if (! $robots->mayFollow()) { return; } $baseUrl = $this->getBaseUrl($response, $crawlUrl); $this->linkAdder->addFromHtml($body, $baseUrl); usleep($this->crawler->getDelayBetweenRequests()); } protected function getBaseUrl(ResponseInterface $response, CrawlUrl $crawlUrl): Uri { $redirectHistory = $response->getHeader(RedirectMiddleware::HISTORY_HEADER); if (empty($redirectHistory)) { return $crawlUrl->url; } return new Uri(end($redirectHistory)); } protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl): void { $this->crawler->getCrawlObservers()->crawled($crawlUrl, $response); } protected function getBody(ResponseInterface $response): string { $contentType = $response->getHeaderLine('Content-Type'); if (! $this->isMimetypeAllowedToParse($contentType)) { return ''; } return $this->convertBodyToString($response->getBody(), $this->crawler->getMaximumResponseSize()); } protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string { if ($bodyStream->isSeekable()) { $bodyStream->rewind(); } $body = ''; $chunksToRead = $readMaximumBytes < 512 ? $readMaximumBytes : 512; for ($bytesRead = 0; $bytesRead < $readMaximumBytes; $bytesRead += $chunksToRead) { try { $newDataRead = $bodyStream->read($chunksToRead); } catch (Exception $exception) { $newDataRead = null; } if (! 
$newDataRead) { break; } $body .= $newDataRead; } return $body; } protected function getBodyAfterExecutingJavaScript(UriInterface $url): string { $browsershot = $this->crawler->getBrowsershot(); $html = $browsershot->setUrl((string) $url)->bodyHtml(); return html_entity_decode($html); } protected function isMimetypeAllowedToParse($contentType): bool { if (empty($contentType)) { return true; } if (! count($this->crawler->getParseableMimeTypes())) { return true; } foreach ($this->crawler->getParseableMimeTypes() as $allowedType) { if (stristr($contentType, $allowedType)) { return true; } } return false; } } src/Handlers/CrawlRequestFailed.php 0000644 00000001450 15107470153 0013335 0 ustar 00 <?php namespace Spatie\Crawler\Handlers; use Exception; use GuzzleHttp\Exception\ConnectException; use GuzzleHttp\Exception\RequestException; use Spatie\Crawler\Crawler; class CrawlRequestFailed { public function __construct(protected Crawler $crawler) { // } public function __invoke(Exception $exception, $index) { if ($exception instanceof ConnectException) { $exception = new RequestException($exception->getMessage(), $exception->getRequest()); } if ($exception instanceof RequestException) { $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index); $this->crawler->getCrawlObservers()->crawlFailed($crawlUrl, $exception); } usleep($this->crawler->getDelayBetweenRequests()); } } src/error_log 0000644 00000001224 15107470154 0007253 0 ustar 00 [19-Nov-2025 03:23:03 UTC] PHP Fatal error: Uncaught Error: Class "GuzzleHttp\Psr7\Response" not found in /home/fluxyjvi/public_html/project/vendor/spatie/crawler/src/ResponseWithCachedBody.php:8 Stack trace: #0 {main} thrown in /home/fluxyjvi/public_html/project/vendor/spatie/crawler/src/ResponseWithCachedBody.php on line 8 [19-Nov-2025 10:13:30 UTC] PHP Fatal error: Uncaught Error: Class "GuzzleHttp\Psr7\Response" not found in /home/fluxyjvi/public_html/project/vendor/spatie/crawler/src/ResponseWithCachedBody.php:8 Stack trace: #0 {main} thrown in /home/fluxyjvi/public_html/project/vendor/spatie/crawler/src/ResponseWithCachedBody.php on line 8 src/CrawlUrl.php 0000644 00000001372 15107470154 0007606 0 ustar 00 <?php namespace Spatie\Crawler; use Psr\Http\Message\UriInterface; class CrawlUrl { public UriInterface $url; public ?UriInterface $foundOnUrl = null; protected mixed $id; public static function create(UriInterface $url, ?UriInterface $foundOnUrl = null, $id = null): static { $static = new static($url, $foundOnUrl); if ($id !== null) { $static->setId($id); } return $static; } protected function __construct(UriInterface $url, $foundOnUrl = null) { $this->url = $url; $this->foundOnUrl = $foundOnUrl; } public function getId(): mixed { return $this->id; } public function setId($id): void { $this->id = $id; } } src/CrawlQueues/ArrayCrawlQueue.php 0000644 00000004512 15107470154 0013366 0 ustar 00 <?php namespace Spatie\Crawler\CrawlQueues; use Psr\Http\Message\UriInterface; use Spatie\Crawler\CrawlUrl; use Spatie\Crawler\Exceptions\InvalidUrl; use Spatie\Crawler\Exceptions\UrlNotFoundByIndex; class ArrayCrawlQueue implements CrawlQueue { /** * All known URLs, indexed by URL string. * * @var CrawlUrl[] */ protected array $urls = []; /** * Pending URLs, indexed by URL string. * * @var CrawlUrl[] */ protected array $pendingUrls = []; public function add(CrawlUrl $crawlUrl): CrawlQueue { $urlString = (string) $crawlUrl->url; if (! 
isset($this->urls[$urlString])) { $crawlUrl->setId($urlString); $this->urls[$urlString] = $crawlUrl; $this->pendingUrls[$urlString] = $crawlUrl; } return $this; } public function hasPendingUrls(): bool { return (bool) $this->pendingUrls; } public function getUrlById($id): CrawlUrl { if (! isset($this->urls[$id])) { throw new UrlNotFoundByIndex("Crawl url {$id} not found in collection."); } return $this->urls[$id]; } public function hasAlreadyBeenProcessed(CrawlUrl $crawlUrl): bool { $urlString = (string) $crawlUrl->url; if (isset($this->pendingUrls[$urlString])) { return false; } if (isset($this->urls[$urlString])) { return true; } return false; } public function markAsProcessed(CrawlUrl $crawlUrl): void { $urlString = (string) $crawlUrl->url; unset($this->pendingUrls[$urlString]); } public function getProcessedUrlCount(): int { return count($this->urls) - count($this->pendingUrls); } public function has(CrawlUrl | UriInterface $crawlUrl): bool { if ($crawlUrl instanceof CrawlUrl) { $urlString = (string) $crawlUrl->url; } elseif ($crawlUrl instanceof UriInterface) { $urlString = (string) $crawlUrl; } else { throw InvalidUrl::unexpectedType($crawlUrl); } return isset($this->urls[$urlString]); } public function getPendingUrl(): ?CrawlUrl { foreach ($this->pendingUrls as $pendingUrl) { return $pendingUrl; } return null; } } src/CrawlQueues/CrawlQueue.php 0000644 00000001074 15107470154 0012367 0 ustar 00 <?php namespace Spatie\Crawler\CrawlQueues; use Psr\Http\Message\UriInterface; use Spatie\Crawler\CrawlUrl; interface CrawlQueue { public function add(CrawlUrl $url): self; public function has(CrawlUrl | UriInterface $crawlUrl): bool; public function hasPendingUrls(): bool; public function getUrlById($id): CrawlUrl; public function getPendingUrl(): ?CrawlUrl; public function hasAlreadyBeenProcessed(CrawlUrl $url): bool; public function markAsProcessed(CrawlUrl $crawlUrl): void; public function getProcessedUrlCount(): int; } src/Crawler.php 0000644 00000032344 15107470154 0007455 0 ustar 00 <?php namespace Spatie\Crawler; use Generator; use GuzzleHttp\Client; use GuzzleHttp\Pool; use GuzzleHttp\Psr7\Request; use GuzzleHttp\Psr7\Uri; use GuzzleHttp\RequestOptions; use Psr\Http\Message\UriInterface; use Spatie\Browsershot\Browsershot; use Spatie\Crawler\CrawlObservers\CrawlObserver; use Spatie\Crawler\CrawlObservers\CrawlObserverCollection; use Spatie\Crawler\CrawlProfiles\CrawlAllUrls; use Spatie\Crawler\CrawlProfiles\CrawlProfile; use Spatie\Crawler\CrawlQueues\ArrayCrawlQueue; use Spatie\Crawler\CrawlQueues\CrawlQueue; use Spatie\Crawler\Exceptions\InvalidCrawlRequestHandler; use Spatie\Crawler\Handlers\CrawlRequestFailed; use Spatie\Crawler\Handlers\CrawlRequestFulfilled; use Spatie\Robots\RobotsTxt; use Tree\Node\Node; class Crawler { public const DEFAULT_USER_AGENT = '*'; protected UriInterface $baseUrl; protected CrawlObserverCollection $crawlObservers; protected CrawlProfile $crawlProfile; protected CrawlQueue $crawlQueue; protected int $totalUrlCount = 0; protected int $currentUrlCount = 0; protected ?int $totalCrawlLimit = null; protected ?int $currentCrawlLimit = null; protected int $maximumResponseSize = 1024 * 1024 * 2; protected ?int $maximumDepth = null; protected bool $respectRobots = true; protected bool $rejectNofollowLinks = true; protected Node $depthTree; protected bool $executeJavaScript = false; protected ?Browsershot $browsershot = null; protected ?RobotsTxt $robotsTxt = null; protected string $crawlRequestFulfilledClass; protected string $crawlRequestFailedClass; 
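    // Note: the delay below is stored in microseconds — setDelayBetweenRequests() multiplies the milliseconds it receives by 1000, and the request handlers pass the value straight to usleep().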
protected int $delayBetweenRequests = 0; protected array $allowedMimeTypes = []; protected string $defaultScheme = 'http'; protected static array $defaultClientOptions = [ RequestOptions::COOKIES => true, RequestOptions::CONNECT_TIMEOUT => 10, RequestOptions::TIMEOUT => 10, RequestOptions::ALLOW_REDIRECTS => false, RequestOptions::HEADERS => [ 'User-Agent' => self::DEFAULT_USER_AGENT, ], ]; public static function create(array $clientOptions = []): static { $clientOptions = (count($clientOptions)) ? $clientOptions : static::$defaultClientOptions; $client = new Client($clientOptions); return new static($client); } public function __construct( protected Client $client, protected int $concurrency = 10, ) { $this->crawlProfile = new CrawlAllUrls(); $this->crawlQueue = new ArrayCrawlQueue(); $this->crawlObservers = new CrawlObserverCollection(); $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class; $this->crawlRequestFailedClass = CrawlRequestFailed::class; } public function getDefaultScheme(): string { return $this->defaultScheme; } public function setDefaultScheme(string $defaultScheme): self { $this->defaultScheme = $defaultScheme; return $this; } public function setConcurrency(int $concurrency): self { $this->concurrency = $concurrency; return $this; } public function setMaximumResponseSize(int $maximumResponseSizeInBytes): self { $this->maximumResponseSize = $maximumResponseSizeInBytes; return $this; } public function getMaximumResponseSize(): ?int { return $this->maximumResponseSize; } public function setTotalCrawlLimit(int $totalCrawlLimit): self { $this->totalCrawlLimit = $totalCrawlLimit; return $this; } public function getTotalCrawlLimit(): ?int { return $this->totalCrawlLimit; } public function getTotalCrawlCount(): int { return $this->totalUrlCount; } public function setCurrentCrawlLimit(int $currentCrawlLimit): self { $this->currentCrawlLimit = $currentCrawlLimit; return $this; } public function getCurrentCrawlLimit(): ?int { return $this->currentCrawlLimit; } public function getCurrentCrawlCount(): int { return $this->currentUrlCount; } public function setMaximumDepth(int $maximumDepth): self { $this->maximumDepth = $maximumDepth; return $this; } public function getMaximumDepth(): ?int { return $this->maximumDepth; } public function setDelayBetweenRequests(int $delayInMilliseconds): self { $this->delayBetweenRequests = ($delayInMilliseconds * 1000); return $this; } public function getDelayBetweenRequests(): int { return $this->delayBetweenRequests; } public function setParseableMimeTypes(array $types): self { $this->allowedMimeTypes = $types; return $this; } public function getParseableMimeTypes(): array { return $this->allowedMimeTypes; } public function ignoreRobots(): self { $this->respectRobots = false; return $this; } public function respectRobots(): self { $this->respectRobots = true; return $this; } public function mustRespectRobots(): bool { return $this->respectRobots; } public function acceptNofollowLinks(): self { $this->rejectNofollowLinks = false; return $this; } public function rejectNofollowLinks(): self { $this->rejectNofollowLinks = true; return $this; } public function mustRejectNofollowLinks(): bool { return $this->rejectNofollowLinks; } public function getRobotsTxt(): RobotsTxt { return $this->robotsTxt; } public function setCrawlQueue(CrawlQueue $crawlQueue): self { $this->crawlQueue = $crawlQueue; return $this; } public function getCrawlQueue(): CrawlQueue { return $this->crawlQueue; } public function executeJavaScript(): self { 
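        // Rendering is delegated to Spatie\Browsershot (headless Chrome via Puppeteer); CrawlRequestFulfilled::getBodyAfterExecutingJavaScript() fetches the rendered HTML.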
$this->executeJavaScript = true; return $this; } public function doNotExecuteJavaScript(): self { $this->executeJavaScript = false; return $this; } public function mayExecuteJavascript(): bool { return $this->executeJavaScript; } public function setCrawlObserver(CrawlObserver | array $crawlObservers): self { if (! is_array($crawlObservers)) { $crawlObservers = [$crawlObservers]; } return $this->setCrawlObservers($crawlObservers); } public function setCrawlObservers(array $crawlObservers): self { $this->crawlObservers = new CrawlObserverCollection($crawlObservers); return $this; } public function addCrawlObserver(CrawlObserver $crawlObserver): self { $this->crawlObservers->addObserver($crawlObserver); return $this; } public function getCrawlObservers(): CrawlObserverCollection { return $this->crawlObservers; } public function setCrawlProfile(CrawlProfile $crawlProfile): self { $this->crawlProfile = $crawlProfile; return $this; } public function getCrawlProfile(): CrawlProfile { return $this->crawlProfile; } public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): self { $baseClass = CrawlRequestFulfilled::class; if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) { throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass); } $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass; return $this; } public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): self { $baseClass = CrawlRequestFailed::class; if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) { throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass); } $this->crawlRequestFailedClass = $crawlRequestFailedClass; return $this; } public function setBrowsershot(Browsershot $browsershot) { $this->browsershot = $browsershot; return $this; } public function setUserAgent(string $userAgent): self { $clientOptions = $this->client->getConfig(); $headers = array_change_key_case($clientOptions['headers']); $headers['user-agent'] = $userAgent; $clientOptions['headers'] = $headers; $this->client = new Client($clientOptions); return $this; } public function getUserAgent(): string { $headers = $this->client->getConfig('headers'); foreach (array_keys($headers) as $name) { if (strtolower($name) === 'user-agent') { return (string) $headers[$name]; } } return static::DEFAULT_USER_AGENT; } public function getBrowsershot(): Browsershot { if (! $this->browsershot) { $this->browsershot = new Browsershot(); } return $this->browsershot; } public function getBaseUrl(): UriInterface { return $this->baseUrl; } public function startCrawling(UriInterface | string $baseUrl) { if (! $baseUrl instanceof UriInterface) { $baseUrl = new Uri($baseUrl); } if ($baseUrl->getScheme() === '') { $baseUrl = $baseUrl->withScheme($this->defaultScheme); } if ($baseUrl->getPath() === '') { $baseUrl = $baseUrl->withPath('/'); } $this->totalUrlCount = $this->crawlQueue->getProcessedUrlCount(); $this->baseUrl = $baseUrl; $crawlUrl = CrawlUrl::create($this->baseUrl); $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url); if ($this->robotsTxt->allows((string) $crawlUrl->url, $this->getUserAgent()) || ! 
$this->respectRobots ) { $this->addToCrawlQueue($crawlUrl); } $this->depthTree = new Node((string) $this->baseUrl); $this->startCrawlingQueue(); foreach ($this->crawlObservers as $crawlObserver) { $crawlObserver->finishedCrawling(); } } public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node { if (is_null($this->maximumDepth)) { return new Node((string) $url); } $node = $node ?? $this->depthTree; $returnNode = null; if ($node->getValue() === (string) $parentUrl) { $newNode = new Node((string) $url); $node->addChild($newNode); return $newNode; } foreach ($node->getChildren() as $currentNode) { $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode); if (! is_null($returnNode)) { break; } } return $returnNode; } protected function startCrawlingQueue(): void { while ( $this->reachedCrawlLimits() === false && $this->crawlQueue->hasPendingUrls() ) { $pool = new Pool($this->client, $this->getCrawlRequests(), [ 'concurrency' => $this->concurrency, 'options' => $this->client->getConfig(), 'fulfilled' => new $this->crawlRequestFulfilledClass($this), 'rejected' => new $this->crawlRequestFailedClass($this), ]); $promise = $pool->promise(); $promise->wait(); } } protected function createRobotsTxt(UriInterface $uri): RobotsTxt { return RobotsTxt::create($uri->withPath('/robots.txt')); } protected function getCrawlRequests(): Generator { while ( $this->reachedCrawlLimits() === false && $crawlUrl = $this->crawlQueue->getPendingUrl() ) { if ( $this->crawlProfile->shouldCrawl($crawlUrl->url) === false || $this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl) ) { $this->crawlQueue->markAsProcessed($crawlUrl); continue; } foreach ($this->crawlObservers as $crawlObserver) { $crawlObserver->willCrawl($crawlUrl->url); } $this->totalUrlCount++; $this->currentUrlCount++; $this->crawlQueue->markAsProcessed($crawlUrl); yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url); } } public function addToCrawlQueue(CrawlUrl $crawlUrl): self { if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) { return $this; } if ($this->getCrawlQueue()->has($crawlUrl->url)) { return $this; } $this->crawlQueue->add($crawlUrl); return $this; } public function reachedCrawlLimits(): bool { $totalCrawlLimit = $this->getTotalCrawlLimit(); if (! is_null($totalCrawlLimit) && $this->getTotalCrawlCount() >= $totalCrawlLimit) { return true; } $currentCrawlLimit = $this->getCurrentCrawlLimit(); if (! is_null($currentCrawlLimit) && $this->getCurrentCrawlCount() >= $currentCrawlLimit) { return true; } return false; } } src/CrawlerRobots.php 0000644 00000002214 15107470155 0010640 0 ustar 00 <?php namespace Spatie\Crawler; use Spatie\Robots\RobotsHeaders; use Spatie\Robots\RobotsMeta; class CrawlerRobots { protected RobotsHeaders $robotsHeaders; protected RobotsMeta $robotsMeta; protected bool $mustRespectRobots; public function __construct(array $headers, string $body, bool $mustRespectRobots) { $this->robotsHeaders = RobotsHeaders::create($headers); $this->robotsMeta = RobotsMeta::create($body); $this->mustRespectRobots = $mustRespectRobots; } public function mayIndex(): bool { if (! $this->mustRespectRobots) { return true; } if (! $this->robotsHeaders->mayIndex()) { return false; } if (! $this->robotsMeta->mayIndex()) { return false; } return true; } public function mayFollow(): bool { if (! $this->mustRespectRobots) { return true; } if (! $this->robotsHeaders->mayFollow()) { return false; } if (! 
$this->robotsMeta->mayFollow()) { return false; } return true; } } README.md 0000644 00000037500 15107470155 0006035 0 ustar 00 # 🕸 Crawl the web using PHP 🕷 [](https://packagist.org/packages/spatie/crawler) [](LICENSE.md)   [](https://packagist.org/packages/spatie/crawler) This package provides a class to crawl links on a website. Under the hood Guzzle promises are used to [crawl multiple urls concurrently](http://docs.guzzlephp.org/en/latest/quickstart.html?highlight=pool#concurrent-requests). Because the crawler can execute JavaScript, it can crawl JavaScript rendered sites. Under the hood [Chrome and Puppeteer](https://github.com/spatie/browsershot) are used to power this feature. ## Support us [<img src="https://github-ads.s3.eu-central-1.amazonaws.com/crawler.jpg?t=1" width="419px" />](https://spatie.be/github-ad-click/crawler) We invest a lot of resources into creating [best in class open source packages](https://spatie.be/open-source). You can support us by [buying one of our paid products](https://spatie.be/open-source/support-us). We highly appreciate you sending us a postcard from your hometown, mentioning which of our package(s) you are using. You'll find our address on [our contact page](https://spatie.be/about-us). We publish all received postcards on [our virtual postcard wall](https://spatie.be/open-source/postcards). ## Installation This package can be installed via Composer: ``` bash composer require spatie/crawler ``` ## Usage The crawler can be instantiated like this ```php use Spatie\Crawler\Crawler; Crawler::create() ->setCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>) ->startCrawling($url); ``` The argument passed to `setCrawlObserver` must be an object that extends the `\Spatie\Crawler\CrawlObservers\CrawlObserver` abstract class: ```php namespace Spatie\Crawler\CrawlObservers; use GuzzleHttp\Exception\RequestException; use Psr\Http\Message\ResponseInterface; use Psr\Http\Message\UriInterface; abstract class CrawlObserver { /** * Called when the crawler will crawl the url. * * @param \Psr\Http\Message\UriInterface $url */ public function willCrawl(UriInterface $url): void { } /** * Called when the crawler has crawled the given url successfully. * * @param \Psr\Http\Message\UriInterface $url * @param \Psr\Http\Message\ResponseInterface $response * @param \Psr\Http\Message\UriInterface|null $foundOnUrl */ abstract public function crawled( UriInterface $url, ResponseInterface $response, ?UriInterface $foundOnUrl = null ): void; /** * Called when the crawler had a problem crawling the given url. * * @param \Psr\Http\Message\UriInterface $url * @param \GuzzleHttp\Exception\RequestException $requestException * @param \Psr\Http\Message\UriInterface|null $foundOnUrl */ abstract public function crawlFailed( UriInterface $url, RequestException $requestException, ?UriInterface $foundOnUrl = null ): void; /** * Called when the crawl has ended. */ public function finishedCrawling(): void { } } ``` ### Using multiple observers You can set multiple observers with `setCrawlObservers`: ```php Crawler::create() ->setCrawlObservers([ <class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>, <class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>, ... 
]) ->startCrawling($url); ``` Alternatively you can set multiple observers one by one with `addCrawlObserver`: ```php Crawler::create() ->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>) ->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>) ->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>) ->startCrawling($url); ``` ### Executing JavaScript By default, the crawler will not execute JavaScript. This is how you can enable the execution of JavaScript: ```php Crawler::create() ->executeJavaScript() ... ``` In order to make it possible to get the body html after the javascript has been executed, this package depends on our [Browsershot](https://github.com/spatie/browsershot) package. This package uses [Puppeteer](https://github.com/puppeteer/puppeteer) under the hood. Here are some pointers on [how to install it on your system](https://spatie.be/docs/browsershot/v2/requirements). Browsershot will make an educated guess as to where its dependencies are installed on your system. By default, the Crawler will instantiate a new Browsershot instance. You may find the need to set a custom created instance using the `setBrowsershot(Browsershot $browsershot)` method. ```php Crawler::create() ->setBrowsershot($browsershot) ->executeJavaScript() ... ``` Note that the crawler will still work even if you don't have the system dependencies required by Browsershot. These system dependencies are only required if you're calling `executeJavaScript()`. ### Filtering certain urls You can tell the crawler not to visit certain urls by using the `setCrawlProfile`-function. That function expects an object that extends `Spatie\Crawler\CrawlProfiles\CrawlProfile`: ```php /* * Determine if the given url should be crawled. */ public function shouldCrawl(UriInterface $url): bool; ``` This package comes with three `CrawlProfiles` out of the box: - `CrawlAllUrls`: this profile will crawl all urls on all pages including urls to an external site. - `CrawlInternalUrls`: this profile will only crawl the internal urls on the pages of a host. - `CrawlSubdomains`: this profile will only crawl the internal urls and its subdomains on the pages of a host. ### Ignoring robots.txt and robots meta By default, the crawler will respect robots data. It is possible to disable these checks like so: ```php Crawler::create() ->ignoreRobots() ... ``` Robots data can come from either a `robots.txt` file, meta tags or response headers. More information on the spec can be found here: [http://www.robotstxt.org/](http://www.robotstxt.org/). Parsing robots data is done by our package [spatie/robots-txt](https://github.com/spatie/robots-txt). ### Accept links with rel="nofollow" attribute By default, the crawler will reject all links containing attribute rel="nofollow". It is possible to disable these checks like so: ```php Crawler::create() ->acceptNofollowLinks() ... ``` ### Using a custom User Agent ### In order to respect robots.txt rules for a custom User Agent you can specify your own custom User Agent. ```php Crawler::create() ->setUserAgent('my-agent') ``` You can add your specific crawl rule group for 'my-agent' in robots.txt. This example disallows crawling the entire site for crawlers identified by 'my-agent'. ```txt // Disallow crawling for my-agent User-agent: my-agent Disallow: / ``` ## Setting the number of concurrent requests To improve the speed of the crawl the package concurrently crawls 10 urls by default. 
If you want to change that number you can use the `setConcurrency` method. ```php Crawler::create() ->setConcurrency(1) // now all urls will be crawled one by one ``` ## Defining Crawl Limits By default, the crawler continues until it has crawled every page it can find. This behavior might cause issues if you are working in an environment with limitations such as a serverless environment. The crawl behavior can be controlled with the following two options: - **Total Crawl Limit** (`setTotalCrawlLimit`): This limit defines the maximal count of URLs to crawl. - **Current Crawl Limit** (`setCurrentCrawlLimit`): This defines how many URLs are processed during the current crawl. Let's take a look at some examples to clarify the difference between these two methods. ### Example 1: Using the total crawl limit The `setTotalCrawlLimit` method allows to limit the total number of URLs to crawl, no matter often you call the crawler. ```php $queue = <your selection/implementation of a queue>; // Crawls 5 URLs and ends. Crawler::create() ->setCrawlQueue($queue) ->setTotalCrawlLimit(5) ->startCrawling($url); // Doesn't crawl further as the total limit is reached. Crawler::create() ->setCrawlQueue($queue) ->setTotalCrawlLimit(5) ->startCrawling($url); ``` ### Example 2: Using the current crawl limit The `setCurrentCrawlLimit` will set a limit on how many URls will be crawled per execution. This piece of code will process 5 pages with each execution, without a total limit of pages to crawl. ```php $queue = <your selection/implementation of a queue>; // Crawls 5 URLs and ends. Crawler::create() ->setCrawlQueue($queue) ->setCurrentCrawlLimit(5) ->startCrawling($url); // Crawls the next 5 URLs and ends. Crawler::create() ->setCrawlQueue($queue) ->setCurrentCrawlLimit(5) ->startCrawling($url); ``` ### Example 3: Combining the total and crawl limit Both limits can be combined to control the crawler: ```php $queue = <your selection/implementation of a queue>; // Crawls 5 URLs and ends. Crawler::create() ->setCrawlQueue($queue) ->setTotalCrawlLimit(10) ->setCurrentCrawlLimit(5) ->startCrawling($url); // Crawls the next 5 URLs and ends. Crawler::create() ->setCrawlQueue($queue) ->setTotalCrawlLimit(10) ->setCurrentCrawlLimit(5) ->startCrawling($url); // Doesn't crawl further as the total limit is reached. Crawler::create() ->setCrawlQueue($queue) ->setTotalCrawlLimit(10) ->setCurrentCrawlLimit(5) ->startCrawling($url); ``` ### Example 4: Crawling across requests You can use the `setCurrentCrawlLimit` to break up long running crawls. The following example demonstrates a (simplified) approach. It's made up of an initial request and any number of follow-up requests continuing the crawl. #### Initial Request To start crawling across different requests, you will need to create a new queue of your selected queue-driver. Start by passing the queue-instance to the crawler. The crawler will start filling the queue as pages are processed and new URLs are discovered. Serialize and store the queue reference after the crawler has finished (using the current crawl limit). ```php // Create a queue using your queue-driver. 
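// e.g. the built-in ArrayCrawlQueue should work here (it can be serialized between requests), as can any other implementation of Spatie\Crawler\CrawlQueues\CrawlQueue.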
$queue = <your selection/implementation of a queue>; // Crawl the first set of URLs Crawler::create() ->setCrawlQueue($queue) ->setCurrentCrawlLimit(10) ->startCrawling($url); // Serialize and store your queue $serializedQueue = serialize($queue); ``` #### Subsequent Requests For any following requests you will need to unserialize your original queue and pass it to the crawler: ```php // Unserialize queue $queue = unserialize($serializedQueue); // Crawls the next set of URLs Crawler::create() ->setCrawlQueue($queue) ->setCurrentCrawlLimit(10) ->startCrawling($url); // Serialize and store your queue $serialized_queue = serialize($queue); ``` The behavior is based on the information in the queue. Only if the same queue-instance is passed in the behavior works as described. When a completely new queue is passed in, the limits of previous crawls -even for the same website- won't apply. An example with more details can be found [here](https://github.com/spekulatius/spatie-crawler-cached-queue-example). ## Setting the maximum crawl depth By default, the crawler continues until it has crawled every page of the supplied URL. If you want to limit the depth of the crawler you can use the `setMaximumDepth` method. ```php Crawler::create() ->setMaximumDepth(2) ``` ## Setting the maximum response size Most html pages are quite small. But the crawler could accidentally pick up on large files such as PDFs and MP3s. To keep memory usage low in such cases the crawler will only use the responses that are smaller than 2 MB. If, when streaming a response, it becomes larger than 2 MB, the crawler will stop streaming the response. An empty response body will be assumed. You can change the maximum response size. ```php // let's use a 3 MB maximum. Crawler::create() ->setMaximumResponseSize(1024 * 1024 * 3) ``` ## Add a delay between requests In some cases you might get rate-limited when crawling too aggressively. To circumvent this, you can use the `setDelayBetweenRequests()` method to add a pause between every request. This value is expressed in milliseconds. ```php Crawler::create() ->setDelayBetweenRequests(150) // After every page crawled, the crawler will wait for 150ms ``` ## Limiting which content-types to parse By default, every found page will be downloaded (up to `setMaximumResponseSize()` in size) and parsed for additional links. You can limit which content-types should be downloaded and parsed by setting the `setParseableMimeTypes()` with an array of allowed types. ```php Crawler::create() ->setParseableMimeTypes(['text/html', 'text/plain']) ``` This will prevent downloading the body of pages that have different mime types, like binary files, audio/video, ... that are unlikely to have links embedded in them. This feature mostly saves bandwidth. ## Using a custom crawl queue When crawling a site the crawler will put urls to be crawled in a queue. By default, this queue is stored in memory using the built-in `ArrayCrawlQueue`. When a site is very large you may want to store that queue elsewhere, maybe a database. In such cases, you can write your own crawl queue. A valid crawl queue is any class that implements the `Spatie\Crawler\CrawlQueues\CrawlQueue`-interface. You can pass your custom crawl queue via the `setCrawlQueue` method on the crawler. 
```php
Crawler::create()
    ->setCrawlQueue(<implementation of \Spatie\Crawler\CrawlQueues\CrawlQueue>)
```

Here are some crawl queue implementations to get you started:

- [ArrayCrawlQueue](https://github.com/spatie/crawler/blob/master/src/CrawlQueues/ArrayCrawlQueue.php)
- [RedisCrawlQueue (third-party package)](https://github.com/repat/spatie-crawler-redis)
- [CacheCrawlQueue for Laravel (third-party package)](https://github.com/spekulatius/spatie-crawler-toolkit-for-laravel)
- [Laravel Model as Queue (third-party example app)](https://github.com/insign/spatie-crawler-queue-with-laravel-model)

## Change the default base url scheme

By default, the crawler will set the base url scheme to `http` if no scheme is specified. You have the ability to change that with `setDefaultScheme`.

```php
Crawler::create()
    ->setDefaultScheme('https')
```

## Changelog

Please see [CHANGELOG](CHANGELOG.md) for more information on what has changed recently.

## Contributing

Please see [CONTRIBUTING](https://github.com/spatie/.github/blob/main/CONTRIBUTING.md) for details.

## Testing

First, install the Puppeteer dependency, or your tests will fail.

```
npm install puppeteer
```

To run the tests you'll have to start the included node based server first in a separate terminal window.

```bash
cd tests/server
npm install
node server.js
```

With the server running, you can start testing.

```bash
composer test
```

## Security

If you've found a bug regarding security please mail [security@spatie.be](mailto:security@spatie.be) instead of using the issue tracker.

## Postcardware

You're free to use this package, but if it makes it to your production environment we highly appreciate you sending us a postcard from your hometown, mentioning which of our package(s) you are using.

Our address is: Spatie, Kruikstraat 22, 2018 Antwerp, Belgium.

We publish all received postcards [on our company website](https://spatie.be/en/opensource/postcards).

## Credits

- [Freek Van der Herten](https://github.com/freekmurze)
- [All Contributors](../../contributors)

## License

The MIT License (MIT). Please see [License File](LICENSE.md) for more information.
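If you need a concrete starting point for the `setCrawlObserver` examples above, a minimal observer might look like the following sketch. The class name and the data it collects are illustrative, not part of the package:

```php
use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlObservers\CrawlObserver;

// Illustrative observer: records the status code (or failure reason) per crawled URL.
class StatusCodeObserver extends CrawlObserver
{
    /** @var array<string, int|string> */
    public array $results = [];

    public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null
    ): void {
        $this->results[(string) $url] = $response->getStatusCode();
    }

    public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null
    ): void {
        $this->results[(string) $url] = 'failed: '.$requestException->getMessage();
    }
}

$observer = new StatusCodeObserver();

Crawler::create()
    ->setCrawlObserver($observer)
    ->startCrawling('https://example.com');

print_r($observer->results);
```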
composer.json 0000644 00000002336 15107470155 0007277 0 ustar 00
{
    "name": "spatie/crawler",
    "description": "Crawl all internal links found on a website",
    "keywords": ["spatie", "crawler", "link", "website"],
    "homepage": "https://github.com/spatie/crawler",
    "license": "MIT",
    "authors": [
        {
            "name": "Freek Van der Herten",
            "email": "freek@spatie.be"
        }
    ],
    "require": {
        "php": "^8.0",
        "guzzlehttp/guzzle": "^7.3",
        "guzzlehttp/psr7": "^2.0",
        "illuminate/collections": "^8.38|^9.0|^10.0",
        "nicmart/tree": "^0.3.0",
        "spatie/browsershot": "^3.45",
        "spatie/robots-txt": "^2.0",
        "symfony/dom-crawler": "^5.2|^6.0"
    },
    "require-dev": {
        "pestphp/pest": "^1.21",
        "phpunit/phpunit": "^9.5"
    },
    "config": {
        "sort-packages": true,
        "allow-plugins": {
            "pestphp/pest-plugin": true,
            "phpstan/extension-installer": true
        }
    },
    "autoload": {
        "psr-4": {
            "Spatie\\Crawler\\": "src"
        }
    },
    "autoload-dev": {
        "psr-4": {
            "Spatie\\Crawler\\Test\\": "tests"
        }
    },
    "scripts": {
        "test": "phpunit"
    }
}
LICENSE.md 0000644 00000002102 15107470155 0006150 0 ustar 00
The MIT License (MIT)

Copyright (c) Spatie bvba <info@spatie.be>

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
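As a companion to the README's `setCrawlProfile` section, here is a minimal custom profile sketch that only crawls URLs under a given path prefix on one host. The class name and constructor are illustrative, not part of the package:

```php
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlProfiles\CrawlProfile;

// Illustrative profile: restrict the crawl to one host and a path prefix.
class CrawlPathPrefix extends CrawlProfile
{
    public function __construct(protected string $host, protected string $pathPrefix)
    {
    }

    public function shouldCrawl(UriInterface $url): bool
    {
        return $url->getHost() === $this->host
            && str_starts_with($url->getPath(), $this->pathPrefix);
    }
}

// Usage:
// Crawler::create()
//     ->setCrawlProfile(new CrawlPathPrefix('example.com', '/docs'))
//     ->startCrawling('https://example.com/docs');
```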