diff --git a/Command/WikidataCommand.php b/Command/WikidataCommand.php
index 0603026..0d0c304 100644
--- a/Command/WikidataCommand.php
+++ b/Command/WikidataCommand.php
@@ -12,6 +12,7 @@
use GuzzleHttp\Exception\GuzzleException;
use GuzzleHttp\HandlerStack;
use GuzzleHttp\Middleware;
+use Psr\Http\Message\ResponseInterface;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Exception\InvalidArgumentException;
use Symfony\Component\Console\Helper\ProgressBar;
@@ -33,6 +34,18 @@ class WikidataCommand extends AbstractCommand
/** @var string Wikidata item URL. */
protected const URL = 'https://www.wikidata.org/wiki/Special:EntityData/';
+ /** @var int Minimum delay between the starts of consecutive Wikidata requests, in microseconds (200 ms). */
+ protected const REQUEST_INTERVAL_MICROSECONDS = 200000;
+
+ /** @var int Maximum number of retries after the initial request. */
+ protected const MAX_RETRIES = 4;
+
+ /** @var int Base retry delay in milliseconds: scaled by the retry count without Retry-After, minimum delay with it. */
+ protected const DEFAULT_RETRY_DELAY_MILLISECONDS = 1000;
+
+ /** @var float|null microtime(true) value taken just before the last outgoing Wikidata request; null until the first one. */
+ private static ?float $lastRequestAt = null;
+
/**
* {@inheritdoc}
*
@@ -127,9 +140,7 @@ protected function execute(InputInterface $input, OutputInterface $output): int
// Download Wikidata item
$path = sprintf('%s/%s.json', $outputDir, $wikidataTag);
- self::save($wikidataTag, $element, $path, $warnings);
-
- if (!file_exists($path) || !is_readable($path)) {
+ if (!self::save($wikidataTag, $element, $path, $warnings)) {
continue;
}
@@ -211,31 +222,30 @@ function ($element): bool {
* @param Element $element OpenStreetMap element (relation/way/node).
* @param string $path Path where to store the result.
* @param string[] $warnings
- * @return void
- *
- * @throws GuzzleException
+ * @return bool True when the file is available locally after the call.
*/
- private static function save(string $identifier, $element, string $path, array &$warnings = []): void
+ private static function save(string $identifier, $element, string $path, array &$warnings = []): bool
{
- if (file_exists($path)) {
- return;
+ if (file_exists($path) && is_readable($path)) {
+ return true;
}
$url = sprintf('%s%s.json', self::URL, $identifier);
$retryMiddleware = Middleware::retry(
function ($retries, $request, $response, $exception) {
- // Stop retrying after 3 attempts
- if ($retries >= 3) {
+ if ($retries >= self::MAX_RETRIES) {
return false;
}
- // Retry on 429 Too Many Requests
if ($response && $response->getStatusCode() === 429) {
return true;
}
return false;
+ },
+ function ($retries, ?ResponseInterface $response = null): int {
+ return self::retryDelayMilliseconds($retries, $response);
}
);
@@ -243,18 +253,22 @@ function ($retries, $request, $response, $exception) {
$stack->push($retryMiddleware);
try {
+ self::throttleRequests();
+
$client = new \GuzzleHttp\Client(['handler' => $stack]);
$client->request('GET', $url, [
'headers' => [
'Accept' => 'application/json',
'User-Agent' => 'EqualStreetNames (+https://equalstreetnames.org)',
],
+ 'connect_timeout' => 10,
'sink' => $path,
+ 'timeout' => 30,
]);
+
+ return true;
} catch (BadResponseException $exception) {
- if (file_exists($path)) {
- unlink($path);
- }
+ self::cleanupPartialDownload($path);
switch ($exception->getResponse()->getStatusCode()) {
case 404:
@@ -264,6 +278,70 @@ function ($retries, $request, $response, $exception) {
$warnings[] = sprintf('Error while fetching Wikidata item %s for %s(%d): %s.', $identifier, $element->type, $element->id, $exception->getMessage());
break;
}
+ } catch (GuzzleException $exception) {
+ self::cleanupPartialDownload($path);
+ $warnings[] = sprintf('Error while fetching Wikidata item %s for %s(%d): %s.', $identifier, $element->type, $element->id, $exception->getMessage());
+ }
+
+ return false;
+ }
+
+    /**
+     * Sleep just long enough that requests start at least REQUEST_INTERVAL_MICROSECONDS apart.
+     *
+     * @return void
+     */
+    private static function throttleRequests(): void
+    {
+        if (self::$lastRequestAt !== null) {
+            // microtime() is wall-clock time: clamp at 0 so a backwards clock step cannot inflate the sleep.
+            $elapsedMicroseconds = max(0, (int) round((microtime(true) - self::$lastRequestAt) * 1000000));
+            $sleepMicroseconds = self::REQUEST_INTERVAL_MICROSECONDS - $elapsedMicroseconds;
+            if ($sleepMicroseconds > 0) {
+                usleep($sleepMicroseconds);
+            }
+        }
+
+        self::$lastRequestAt = microtime(true);
+    }
+
+    /**
+     * Compute the delay before the next retry, honouring Retry-After (seconds or HTTP-date) within safe bounds.
+     *
+     * @param int $retries Current retry count.
+     * @param ResponseInterface|null $response Response that triggered the retry, if any.
+     * @return int Delay in milliseconds, never below the default retry delay and never above five minutes.
+     */
+    private static function retryDelayMilliseconds(int $retries, ?ResponseInterface $response = null): int
+    {
+        $floor = self::DEFAULT_RETRY_DELAY_MILLISECONDS;
+        $ceiling = 300000; // five minutes: a huge or malformed Retry-After must not stall the whole command
+        $header = $response === null ? '' : trim($response->getHeaderLine('Retry-After'));
+
+        if (ctype_digit($header)) {
+            // Clamp in float space first: a very long digit string would overflow integer arithmetic.
+            return (int) max($floor, min((float) $header * 1000, $ceiling));
+        }
+
+        // Otherwise the header may be an HTTP-date; an empty or unparseable value falls through to the backoff.
+        $retryAt = $header === '' ? false : strtotime($header);
+        if ($retryAt !== false) {
+            return (int) max($floor, min(($retryAt - time()) * 1000, $ceiling));
+        }
+
+        return (int) min($floor * max($retries, 1), $ceiling);
+    }
+
+ /**
+ * Delete the file at $path, if present, so a failed download leaves no partial file behind.
+ *
+ * @param string $path Path of the download target to remove.
+ * @return void
+ */
+ private static function cleanupPartialDownload(string $path): void
+ {
+ if (file_exists($path)) {
+ unlink($path);
}
}
}