From ccab5ae312ec68f2c7f74a8aa24b5afaae2bd019 Mon Sep 17 00:00:00 2001
From: ashfame
Date: Wed, 27 Nov 2024 21:49:41 +0400
Subject: [PATCH 1/4] define crawler with backend as queue storage
---
.eslintrc | 12 +-
src/crawler/crawler.ts | 261 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 272 insertions(+), 1 deletion(-)
create mode 100644 src/crawler/crawler.ts
diff --git a/.eslintrc b/.eslintrc
index 497e93d7..307637d5 100644
--- a/.eslintrc
+++ b/.eslintrc
@@ -22,5 +22,15 @@
"no-console": [
"off"
]
- }
+ },
+ "overrides": [
+ {
+ "files": [
+ "src/crawler/**/*.ts"
+ ],
+ "rules": {
+ "react/no-is-mounted": "off"
+ }
+ }
+ ]
}
diff --git a/src/crawler/crawler.ts b/src/crawler/crawler.ts
new file mode 100644
index 00000000..e1f705a2
--- /dev/null
+++ b/src/crawler/crawler.ts
@@ -0,0 +1,261 @@
+import { CommandTypes, sendCommandToContent } from '@/bus/Command';
+
+interface CrawlerState { // Mutable runtime state held by a single Crawler instance.
+ isActive: boolean; // true while the start() loop runs; cleared by stop(), by a processing error, or when no next URL is returned
+ nextProcessTime: number; // timestamp on the Date.now() scale (ms) before which waitForRateLimit() delays the next page
+ rateLimit: number; // pages per second; updateRateLimit() clamps it to the 0.1–10 range
+}
+
+interface QueueUrlsResponse { // JSON body queueUrls() resolves with after a successful POST to /crawl-api/queue-urls.
+ accepted: number; // presumably how many submitted URLs were newly queued — TODO confirm against the backend
+ rejected: number; // presumably how many submitted URLs were not queued — TODO confirm against the backend
+ queueSize: number; // presumably the number of URLs still waiting in the queue — TODO confirm against the backend
+ crawledCount: number; // presumably the number of URLs already crawled — TODO confirm; NOTE(review): the queue_urls() REST handler in this patch series returns an empty body
+}
+
+interface NextUrlResponse { // JSON shape getNextUrl() expects from GET /crawl-api/next-url on a non-204 success.
+ url: string; // the URL getNextUrl() hands back to the crawl loop
+}
+
+interface QueueUrlsRequest { // JSON payload queueUrls() POSTs to /crawl-api/queue-urls.
+ urls: string[]; // links extracted from the page that was just processed
+ sourceUrl: string; // URL of the page those links were extracted from
+}
+
+/**
+ * Drives a crawl whose queue lives on the backend: repeatedly asks the backend
+ * for the next URL, navigates to it, hands the page HTML to a pluggable
+ * processing callback, and reports the same-host links found on that page back
+ * to the backend queue.
+ */
+class Crawler {
+	private readonly state: CrawlerState;
+	private process: ( html: string ) => Promise< void >;
+
+	constructor() {
+		this.state = {
+			isActive: false,
+			nextProcessTime: 0,
+			rateLimit: 1.0, // pages per sec; 1.0 means 1000ms delay between requests
+		};
+		// No-op until a consumer installs a real handler via setProcessFunction().
+		this.process = async () => {};
+	}
+
+	/**
+	 * Forwards a message to the matching console method.
+	 *
+	 * @param level Console method to use.
+	 * @param args  Values passed through to the console unchanged.
+	 */
+	private log( level: 'log' | 'warn' | 'error', ...args: unknown[] ): void {
+		console[ level ]( ...args );
+	}
+
+	/**
+	 * Installs the callback that receives each crawled page's HTML.
+	 *
+	 * @param processFn Async handler invoked once per page before links are extracted.
+	 */
+	public setProcessFunction(
+		processFn: ( html: string ) => Promise< void >
+	): void {
+		this.process = processFn;
+	}
+
+	/**
+	 * Runs the crawl loop until the backend reports an empty queue, stop() is
+	 * called, or a page fails to process. Calling it while already running is
+	 * a no-op.
+	 *
+	 * @throws Error When the next URL cannot be fetched after all retries.
+	 */
+	public async start(): Promise< void > {
+		if ( this.state.isActive ) {
+			this.log( 'log', 'Crawler already running' );
+			return;
+		}
+
+		this.state.isActive = true;
+		this.log( 'log', 'Crawler started' );
+
+		try {
+			while ( this.state.isActive ) {
+				const next = await this.getNextUrl();
+				if ( ! next ) {
+					this.log( 'log', 'Crawler finished' );
+					break;
+				}
+				await this.processUrl( next );
+			}
+		} finally {
+			// Release the flag even when getNextUrl() rejects; otherwise every
+			// later start() would bail out with "already running".
+			this.state.isActive = false;
+		}
+	}
+
+	/**
+	 * Crawls a single URL: waits for the rate limit, navigates, runs the
+	 * process callback on the page HTML, then queues the links found.
+	 * Any failure is logged and stops the crawler instead of propagating.
+	 *
+	 * @param url URL to crawl.
+	 */
+	private async processUrl( url: string ): Promise< void > {
+		this.log( 'log', 'processing url', url );
+		try {
+			// Wait until we're allowed to process the next URL
+			await this.waitForRateLimit();
+
+			await this.navigateToUrl( url );
+
+			// @TODO: Get the HTML content via bus?
+			// NOTE(review): this reads the document of the context the crawler
+			// runs in, right after the navigate command was dispatched — confirm
+			// it reflects the page that was navigated to.
+			const html = document.documentElement.outerHTML;
+
+			// Process the page content
+			await this.process( html );
+
+			// Extract and queue new URLs
+			const links = this.extractLinks( html );
+			await this.queueUrls( links, url );
+		} catch ( error ) {
+			this.log( 'error', 'Error processing URL', url, error );
+			this.state.isActive = false;
+		}
+	}
+
+	/**
+	 * Sleeps until the next page may be processed, then reserves the following
+	 * slot `1000 / rateLimit` milliseconds after the moment we actually proceed.
+	 */
+	private async waitForRateLimit(): Promise< void > {
+		const delayMs = 1000 / this.state.rateLimit; // Convert rate limit to milliseconds between requests
+
+		const waitMs = this.state.nextProcessTime - Date.now();
+		if ( waitMs > 0 ) {
+			await this.sleep( waitMs );
+		}
+
+		// Re-read the clock: basing the next slot on a timestamp taken before
+		// sleeping would let the following request fire too early.
+		this.state.nextProcessTime = Date.now() + delayMs;
+	}
+
+	/**
+	 * Collects the unique, absolute, same-host link targets of all anchors in
+	 * an HTML string.
+	 *
+	 * Anchors without an href, `javascript:` links, fragment-only links, hrefs
+	 * that cannot be parsed as a URL, and links to other hosts are dropped.
+	 *
+	 * @param htmlString HTML markup of the crawled page.
+	 * @returns Absolute URLs in first-seen order, without duplicates.
+	 */
+	private extractLinks( htmlString: string ): string[] {
+		const doc = new DOMParser().parseFromString( htmlString, 'text/html' );
+		const uniqueLinks = new Set< string >();
+
+		for ( const anchor of Array.from( doc.querySelectorAll( 'a' ) ) ) {
+			const href = anchor.getAttribute( 'href' );
+
+			// Skip if no href, or it's a javascript: link or anchor link
+			if (
+				! href ||
+				href.startsWith( 'javascript:' ) ||
+				href.startsWith( '#' )
+			) {
+				continue;
+			}
+
+			// Resolve against the full page URL (not just the origin) so that
+			// path-relative links such as "page/2" end up in the right directory.
+			let resolved: URL;
+			try {
+				resolved = new URL( href, window.location.href );
+			} catch {
+				// An unparsable href cannot be crawled.
+				continue;
+			}
+
+			// Stay on the site being crawled; this also drops mailto:/tel: links,
+			// whose hostname is empty.
+			if ( resolved.hostname !== window.location.hostname ) {
+				continue;
+			}
+
+			uniqueLinks.add( resolved.href );
+		}
+
+		return [ ...uniqueLinks ];
+	}
+
+	/**
+	 * Reports the links discovered on a page to the backend queue.
+	 *
+	 * On a non-OK HTTP status the request is retried after a one second pause,
+	 * for at most `maxRetries` attempts in total.
+	 *
+	 * @param urls       Links discovered on the page.
+	 * @param sourceUrl  URL of the page the links were found on.
+	 * @param retryCount Zero-based index of the current attempt.
+	 * @param maxRetries Maximum number of attempts.
+	 * @returns The parsed JSON response body.
+	 * @throws Error When every attempt ends in a non-OK status.
+	 */
+	private async queueUrls(
+		urls: string[],
+		sourceUrl: string,
+		retryCount = 0,
+		maxRetries = 5
+	): Promise< QueueUrlsResponse > {
+		const request: QueueUrlsRequest = {
+			urls,
+			sourceUrl,
+		};
+
+		const response = await fetch( '/crawl-api/queue-urls', {
+			method: 'POST',
+			headers: { 'Content-Type': 'application/json' },
+			body: JSON.stringify( request ),
+		} );
+
+		if ( ! response.ok ) {
+			this.log(
+				'warn',
+				`Attempt ${
+					retryCount + 1
+				}/${ maxRetries } failed: HTTP error! status: ${
+					response.status
+				}`
+			);
+
+			if ( retryCount >= maxRetries - 1 ) {
+				throw new Error(
+					`Failed to queue URLs after ${ maxRetries } attempts`
+				);
+			}
+
+			// Wait before retrying
+			await this.sleep();
+
+			// Pass the incremented value: a postfix `retryCount++` would hand
+			// over the old count and the retries would never terminate.
+			return this.queueUrls( urls, sourceUrl, retryCount + 1, maxRetries );
+		}
+
+		return response.json();
+	}
+
+	/**
+	 * Resolves after the given number of milliseconds.
+	 *
+	 * @param ms Delay in milliseconds.
+	 */
+	private async sleep( ms: number = 1000 ): Promise< void > {
+		return new Promise( ( resolve ) => setTimeout( resolve, ms ) );
+	}
+
+	/**
+	 * Asks the backend for the next URL to crawl.
+	 *
+	 * On a non-OK HTTP status the request is retried after a one second pause,
+	 * for at most `maxRetries` attempts in total.
+	 *
+	 * NOTE(review): the REST handler added later in this patch series is
+	 * registered as `try-wp/v1/crawler/next` and responds with a bare string —
+	 * confirm the endpoint path and the `{ url }` response shape used here.
+	 *
+	 * @param retryCount Zero-based index of the current attempt.
+	 * @param maxRetries Maximum number of attempts.
+	 * @returns The next URL, or null when the backend answers 204 (queue finished).
+	 * @throws Error When every attempt ends in a non-OK status.
+	 */
+	private async getNextUrl(
+		retryCount = 0,
+		maxRetries = 5
+	): Promise< string | null > {
+		const response = await fetch( '/crawl-api/next-url' );
+
+		// crawling queue is finished
+		if ( response.status === 204 ) {
+			return null;
+		}
+
+		if ( ! response.ok ) {
+			this.log(
+				'warn',
+				`Attempt ${
+					retryCount + 1
+				}/${ maxRetries } failed: HTTP error! status: ${
+					response.status
+				}`
+			);
+
+			if ( retryCount >= maxRetries - 1 ) {
+				throw new Error(
+					`Failed to get next URL after ${ maxRetries } attempts`
+				);
+			}
+
+			// Wait before retrying
+			await this.sleep();
+
+			// Pass the incremented value so the attempt counter advances.
+			return this.getNextUrl( retryCount + 1, maxRetries );
+		}
+
+		const data: NextUrlResponse = await response.json();
+		return data.url;
+	}
+
+	/**
+	 * Dispatches a NavigateTo command for the URL to the content script.
+	 * The command is fire-and-forget: its result is not awaited.
+	 *
+	 * @param url URL to navigate to.
+	 */
+	private async navigateToUrl( url: string ): Promise< void > {
+		void sendCommandToContent( {
+			type: CommandTypes.NavigateTo,
+			payload: { url },
+		} );
+	}
+
+	/**
+	 * Requests the crawl loop to stop; the loop exits once the step in flight completes.
+	 */
+	public stop(): void {
+		this.state.isActive = false;
+	}
+
+	/**
+	 * Changes the crawl speed.
+	 *
+	 * @param newLimit Pages per second; clamped to the 0.1–10 range. NaN is ignored.
+	 */
+	public updateRateLimit( newLimit: number ): void {
+		// NaN would survive the clamp below and turn every delay into NaN,
+		// which silently disables rate limiting.
+		if ( Number.isNaN( newLimit ) ) {
+			this.log( 'warn', 'Ignoring invalid rate limit', newLimit );
+			return;
+		}
+
+		// only allow between 0.1 and 10 pages per second - no reason for this limit; feel free to change
+		this.state.rateLimit = Math.max( 0.1, Math.min( 10.0, newLimit ) );
+	}
+}
From 695edaa4cf96b63b506ee822572b429721e3ad42 Mon Sep 17 00:00:00 2001
From: ashfame
Date: Thu, 28 Nov 2024 09:46:26 +0400
Subject: [PATCH 2/4] define post type for storing crawler queue
---
EXTEND.md | 2 +-
src/plugin/class-engine.php | 13 ++--
src/plugin/class-post-type-ui.php | 67 +++++++++++++++++----
src/plugin/class-storage.php | 98 ++++++++++++++++++++++---------
tests/plugin/test-storage.php | 3 +-
tests/plugin/test-transformer.php | 2 +-
6 files changed, 137 insertions(+), 48 deletions(-)
diff --git a/EXTEND.md b/EXTEND.md
index 5a3689d2..81917d87 100644
--- a/EXTEND.md
+++ b/EXTEND.md
@@ -15,7 +15,7 @@ the future or any third-party plugin to transform the data upon installation of
## Storage Architecture
Try WordPress stores all liberated data in a custom post type called `liberated_data`, exposed via a constant:
-`\DotOrg\TryWordPress\Engine::STORAGE_POST_TYPE`.
+`\DotOrg\TryWordPress\Engine::LIBERATED_DATA_POST_TYPE`.
We maintain references between the source data and transformed output using two post meta keys:
diff --git a/src/plugin/class-engine.php b/src/plugin/class-engine.php
index ee648c88..46041639 100644
--- a/src/plugin/class-engine.php
+++ b/src/plugin/class-engine.php
@@ -4,7 +4,8 @@
class Engine {
- public const string STORAGE_POST_TYPE = 'liberated_data';
+ public const string LIBERATED_DATA_POST_TYPE = 'liberated_data';
+ public const string CRAWLER_DATA_POST_TYPE = 'dl_crawler_url';
public function __construct() {
require 'enum-subject-type.php';
@@ -21,15 +22,15 @@ public function __construct() {
( function () {
$transformer = new Transformer();
- new Post_Type_UI( self::STORAGE_POST_TYPE, $transformer );
+ new Post_Type_UI( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_DATA_POST_TYPE, $transformer );
// REST API
- new Blogpost_Controller( self::STORAGE_POST_TYPE );
- new Page_Controller( self::STORAGE_POST_TYPE );
+ new Blogpost_Controller( self::LIBERATED_DATA_POST_TYPE );
+ new Page_Controller( self::LIBERATED_DATA_POST_TYPE );
- new Storage( self::STORAGE_POST_TYPE );
+ new Storage( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_DATA_POST_TYPE );
- Subject_Repo::init( self::STORAGE_POST_TYPE );
+ Subject_Repo::init( self::LIBERATED_DATA_POST_TYPE );
} )();
}
}
diff --git a/src/plugin/class-post-type-ui.php b/src/plugin/class-post-type-ui.php
index a607791f..2e790616 100644
--- a/src/plugin/class-post-type-ui.php
+++ b/src/plugin/class-post-type-ui.php
@@ -3,20 +3,30 @@
namespace DotOrg\TryWordPress;
class Post_Type_UI {
- private string $post_type;
+ private string $liberated_data_post_type;
+ private string $crawler_data_post_type;
+
private Transformer $transformer;
- public function __construct( $custom_post_type, Transformer $transformer ) {
- $this->post_type = $custom_post_type;
- $this->transformer = $transformer;
+ public function __construct( string $liberated_data_post_type, string $crawler_data_post_type, Transformer $transformer ) {
+ $this->liberated_data_post_type = $liberated_data_post_type;
+ $this->crawler_data_post_type = $crawler_data_post_type;
+ $this->transformer = $transformer;
- $this->remove_add_new_option( $this->post_type );
+ $this->remove_add_new_option( $this->liberated_data_post_type );
+ $this->remove_add_new_option( $this->crawler_data_post_type );
// Strip editor to be barebones.
add_filter(
'wp_editor_settings',
function ( $settings, $editor_id ) {
- if ( 'content' === $editor_id && get_current_screen()->post_type === $this->post_type ) {
+ if (
+ 'content' === $editor_id &&
+ (
+ get_current_screen()->post_type === $this->liberated_data_post_type ||
+ get_current_screen()->post_type === $this->crawler_data_post_type
+ )
+ ) {
$settings['tinymce'] = false;
$settings['quicktags'] = false;
$settings['media_buttons'] = false;
@@ -37,15 +47,23 @@ function () {
$cpt_screen = false;
if ( 'post-new.php' === $pagenow ) { // New post screen
// @phpcs:ignore WordPress.Security.NonceVerification.Recommended
- if ( isset( $_GET['post_type'] ) && $_GET['post_type'] === $this->post_type ) {
+ // @phpcs:disable
+ if (
+ isset( $_GET['post_type'] ) &&
+ (
+ $_GET['post_type'] === $this->liberated_data_post_type ||
+ $_GET['post_type'] === $this->crawler_data_post_type
+ )
+ ) {
$cpt_screen = true;
}
+ // @phpcs:enable
}
if ( 'post.php' === $pagenow ) { // Edit post screen
// @phpcs:ignore WordPress.Security.NonceVerification.Recommended, WordPress.Security.ValidatedSanitizedInput.InputNotValidated
$post_type = get_post_type( absint( $_GET['post'] ) );
- if ( $post_type === $this->post_type ) {
+ if ( $post_type === $this->liberated_data_post_type || $post_type === $this->crawler_data_post_type ) {
$cpt_screen = true;
}
}
@@ -61,7 +79,7 @@ function () {
add_filter(
'use_block_editor_for_post_type',
function ( $use_block_editor, $post_type ) {
- if ( $post_type === $this->post_type ) {
+ if ( $post_type === $this->liberated_data_post_type || $post_type === $this->crawler_data_post_type ) {
return false;
}
@@ -76,8 +94,11 @@ function ( $use_block_editor, $post_type ) {
'add_meta_boxes',
function () {
// Remove default meta boxes
- remove_meta_box( 'submitdiv', $this->post_type, 'side' );
- remove_meta_box( 'slugdiv', $this->post_type, 'normal' );
+ remove_meta_box( 'submitdiv', $this->liberated_data_post_type, 'side' );
+ remove_meta_box( 'slugdiv', $this->liberated_data_post_type, 'normal' );
+
+ remove_meta_box( 'submitdiv', $this->crawler_data_post_type, 'side' );
+
/**
* We would need to remove more metaboxes as their support is added to CPTs.
* Leaving code here for reference.
@@ -116,10 +137,32 @@ function () {
echo "This post hasn't been transformed yet.
";
}
},
- $this->post_type,
+ $this->liberated_data_post_type,
'side',
'default'
);
+
+ add_meta_box(
+ 'discovered_crawler_url',
+ 'Discovered URL',
+ function () {
+ global $post;
+ ?>
+
+
+
+
+ Status:
+
post_status ) ); ?>
+
+ crawler_data_post_type,
+ 'advanced',
+ 'default'
+ );
},
999
);
diff --git a/src/plugin/class-storage.php b/src/plugin/class-storage.php
index 94a1e07f..6fd4023c 100644
--- a/src/plugin/class-storage.php
+++ b/src/plugin/class-storage.php
@@ -3,44 +3,88 @@
namespace DotOrg\TryWordPress;
class Storage {
- private string $post_type;
- private string $post_type_name;
+ private string $liberated_data_post_type;
+ private string $liberated_data_post_type_name;
+ private string $crawler_data_post_type;
+ private string $crawler_data_post_type_name = 'Crawler URL';
+ private string $crawler_data_post_type_name_plural = 'Crawler URLs';
private array $custom_post_types_supports = array( 'title', 'editor', 'custom-fields' );
- public function __construct( string $post_type ) {
- $this->post_type = $post_type;
- $this->post_type_name = ucwords( str_replace( '_', ' ', $post_type ) );
+ public function __construct( string $liberated_data_post_type, string $crawler_data_post_type ) {
+ $this->liberated_data_post_type = $liberated_data_post_type;
+ $this->liberated_data_post_type_name = ucwords( str_replace( '_', ' ', $liberated_data_post_type ) );
+ $this->crawler_data_post_type = $crawler_data_post_type;
add_action( 'init', array( $this, 'register_post_types' ) );
}
- private function get_singular_name(): string {
- return $this->post_type_name;
- }
+ public function register_post_types(): void {
+ register_post_type(
+ $this->liberated_data_post_type,
+ array(
+ 'public' => false,
+ 'exclude_from_search' => true,
+ 'publicly_queryable' => false,
+ 'show_in_rest' => true,
+ 'show_ui' => true,
+ 'show_in_menu' => WP_DEBUG,
+ 'menu_icon' => 'dashicons-database',
+ 'supports' => $this->custom_post_types_supports,
+ 'labels' => $this->get_post_type_registration_labels(
+ $this->liberated_data_post_type_name,
+ $this->liberated_data_post_type_name
+ ),
+ 'rest_base' => $this->liberated_data_post_type,
+ )
+ );
- private function get_plural_name(): string {
- return $this->post_type_name;
- }
+ register_post_type(
+ $this->crawler_data_post_type,
+ array(
+ 'public' => false,
+ 'exclude_from_search' => true,
+ 'publicly_queryable' => false,
+ 'show_in_rest' => false,
+ 'show_ui' => true,
+ 'show_in_menu' => WP_DEBUG,
+ 'menu_icon' => 'dashicons-editor-ul',
+ 'supports' => array( '' ), // has to be empty string array, otherwise title and content support comes in by default
+ 'labels' => $this->get_post_type_registration_labels(
+ $this->crawler_data_post_type_name,
+ $this->crawler_data_post_type_name_plural
+ ),
+ 'rest_base' => $this->crawler_data_post_type,
+ )
+ );
- public function register_post_types(): void {
- $name = $this->get_singular_name();
- $name_plural = $this->get_plural_name();
-
- $args = array(
- 'public' => false,
- 'exclude_from_search' => true,
- 'publicly_queryable' => false,
- 'show_in_rest' => true,
- 'show_ui' => true,
- 'show_in_menu' => WP_DEBUG,
- 'menu_icon' => 'dashicons-database',
- 'supports' => $this->custom_post_types_supports,
- 'labels' => $this->get_post_type_registration_labels( $name, $name_plural ),
- 'rest_base' => $this->post_type,
+ register_post_status(
+ 'discovered',
+ array(
+ 'label' => _x( 'Discovered', 'post status', 'try_wordpress' ),
+ 'public' => false,
+ 'exclude_from_search' => true,
+ 'show_in_admin_all_list' => true,
+ 'show_in_admin_status_list' => true,
+ 'internal' => true,
+ // translators: %s: Number of discovered posts
+ 'label_count' => _n_noop( 'Discovered (%s)', 'Discovered (%s)', 'try_wordpress' ),
+ )
);
- register_post_type( $this->post_type, $args );
+ register_post_status(
+ 'crawled',
+ array(
+ 'label' => _x( 'Crawled', 'post status', 'try_wordpress' ),
+ 'public' => false,
+ 'exclude_from_search' => true,
+ 'show_in_admin_all_list' => true,
+ 'show_in_admin_status_list' => true,
+ 'internal' => true,
+ // translators: %s: Number of crawled posts
+ 'label_count' => _n_noop( 'Crawled (%s)', 'Crawled (%s)', 'try_wordpress' ),
+ )
+ );
}
public function get_post_type_registration_labels( string $name, string $name_plural ): array {
diff --git a/tests/plugin/test-storage.php b/tests/plugin/test-storage.php
index 5b496b83..6b534eb5 100644
--- a/tests/plugin/test-storage.php
+++ b/tests/plugin/test-storage.php
@@ -8,11 +8,12 @@ class Storage_Test extends TestCase {
protected function setUp(): void {
parent::setUp();
- $this->storage = new Storage( 'lib_x' );
+ $this->storage = new Storage( 'lib_x', 'lib_crawl' );
}
public function testRegisterPostTypes(): void {
do_action( 'init' );
$this->assertTrue( post_type_exists( 'lib_x' ), 'Custom post type meant for storage not registered' );
+ $this->assertTrue( post_type_exists( 'lib_crawl' ), 'Custom post type meant for storage not registered' );
}
}
diff --git a/tests/plugin/test-transformer.php b/tests/plugin/test-transformer.php
index cc3ae347..edb18ad3 100644
--- a/tests/plugin/test-transformer.php
+++ b/tests/plugin/test-transformer.php
@@ -23,7 +23,7 @@ protected function setUp(): void {
'post_status' => 'draft',
'post_content_filtered' => '',
'guid' => 'https://example.com/x',
- 'post_type' => Engine::STORAGE_POST_TYPE,
+ 'post_type' => 'lib_x',
)
);
update_post_meta( $this->post_id_in_db, 'subject_type', SubjectType::BLOGPOST->value );
From 9a15f48fa3395996f0919f188a7c2fe19a4ebe0c Mon Sep 17 00:00:00 2001
From: ashfame
Date: Fri, 29 Nov 2024 13:13:18 +0400
Subject: [PATCH 3/4] define Controller Registry to manage instances of all
REST API controllers
---
src/plugin/class-controller-registry.php | 11 +++++++++++
src/plugin/class-engine.php | 5 ++---
2 files changed, 13 insertions(+), 3 deletions(-)
create mode 100644 src/plugin/class-controller-registry.php
diff --git a/src/plugin/class-controller-registry.php b/src/plugin/class-controller-registry.php
new file mode 100644
index 00000000..e8cbcb2e
--- /dev/null
+++ b/src/plugin/class-controller-registry.php
@@ -0,0 +1,11 @@
+
Date: Mon, 2 Dec 2024 12:50:08 +0400
Subject: [PATCH 4/4] wip
---
src/plugin/class-controller-registry.php | 23 ++-
src/plugin/class-crawler-controller.php | 200 +++++++++++++++++++++++
src/plugin/class-engine.php | 9 +-
tests/plugin/base-test.php | 10 ++
tests/plugin/test-crawler-controller.php | 64 ++++++++
5 files changed, 301 insertions(+), 5 deletions(-)
create mode 100644 src/plugin/class-crawler-controller.php
create mode 100644 tests/plugin/test-crawler-controller.php
diff --git a/src/plugin/class-controller-registry.php b/src/plugin/class-controller-registry.php
index e8cbcb2e..6a7cd64f 100644
--- a/src/plugin/class-controller-registry.php
+++ b/src/plugin/class-controller-registry.php
@@ -4,8 +4,29 @@
class Controller_Registry {
- public function __construct( string $liberated_data_post_type, string $crawler_data_post_type ) {
+ public function __construct( string $liberated_data_post_type, string $crawler_queue_post_type ) {
new Blogpost_Controller( $liberated_data_post_type );
new Page_Controller( $liberated_data_post_type );
+
+ $domain = $this->infer_domain( $liberated_data_post_type );
+
+ new Crawler_Controller( $domain, $crawler_queue_post_type );
+ }
+
+	/**
+	 * Derives the origin (scheme + host) of the site being liberated from the
+	 * guid of one draft post of the liberated data post type.
+	 *
+	 * @param string $liberated_data_post_type Post type holding liberated data.
+	 * @return string Origin such as `https://example.org`, or an empty string when
+	 *                no such post exists or its guid has no scheme and host.
+	 */
+	private function infer_domain( string $liberated_data_post_type ): string {
+		$liberated_posts = get_posts(
+			array(
+				'post_type'      => $liberated_data_post_type,
+				'posts_per_page' => 1,
+				'post_status'    => 'draft',
+			)
+		);
+
+		if ( empty( $liberated_posts ) ) {
+			return '';
+		}
+
+		// wp_parse_url() returns false for seriously malformed URLs and omits
+		// components that are absent, so guard before indexing.
+		$parts = wp_parse_url( $liberated_posts[0]->guid );
+		if ( ! is_array( $parts ) || empty( $parts['scheme'] ) || empty( $parts['host'] ) ) {
+			return '';
+		}
+
+		return $parts['scheme'] . '://' . $parts['host'];
+	}
}
diff --git a/src/plugin/class-crawler-controller.php b/src/plugin/class-crawler-controller.php
new file mode 100644
index 00000000..697b91f6
--- /dev/null
+++ b/src/plugin/class-crawler-controller.php
@@ -0,0 +1,200 @@
+domain = $domain;
+ $this->crawler_queue_post_type = $crawler_queue_post_type;
+
+ add_action( 'rest_api_init', array( $this, 'register_routes' ) );
+ }
+
+	/**
+	 * Registers the crawler REST routes under the `try-wp/v1` namespace:
+	 *
+	 * - GET  /crawler/next  handled by get_next_url()
+	 * - POST /crawler/queue handled by queue_urls()
+	 *
+	 * NOTE(review): both routes use `__return_true` as permission callback, so
+	 * they are reachable without authentication although /crawler/queue inserts
+	 * and updates posts — confirm this is intended.
+	 */
+	public function register_routes(): void {
+		$version   = '1';
+		$namespace = 'try-wp/v' . $version;
+		register_rest_route(
+			$namespace,
+			'/crawler/next',
+			array(
+				array(
+					'methods'             => WP_REST_Server::READABLE,
+					'callback'            => array( $this, 'get_next_url' ),
+					'permission_callback' => '__return_true',
+					'args'                => array(
+						'context' => array(
+							'default' => 'view',
+						),
+					),
+				),
+			)
+		);
+		register_rest_route(
+			$namespace,
+			'/crawler/queue',
+			array(
+				array(
+					// queue_urls() reads a JSON request body and writes posts,
+					// so the route must accept POST rather than GET.
+					'methods'             => WP_REST_Server::CREATABLE,
+					'callback'            => array( $this, 'queue_urls' ),
+					'permission_callback' => '__return_true',
+					// @TODO Specify args here so that sanitization is handled automatically
+					'args'                => array(
+						'context' => array(
+							'default' => 'view',
+						),
+					),
+				),
+			)
+		);
+	}
+
+	/**
+	 * REST callback: hands out the next URL the crawler should visit.
+	 *
+	 * Responds with the guid of the oldest post in `discovered` status. When no
+	 * such post exists, responds 204 if at least one `crawled` post exists,
+	 * otherwise responds with the configured domain as the first URL to crawl.
+	 *
+	 * NOTE(review): the domain is returned without being inserted into the
+	 * queue, while queue_urls() answers 404 for a sourceUrl it cannot find —
+	 * confirm how the first URL is meant to be recorded as crawled.
+	 *
+	 * @param mixed $request REST request (unused).
+	 * @return WP_REST_Response|WP_Error
+	 */
+	public function get_next_url( $request ): WP_REST_Response|WP_Error {
+		$oldest_first = array(
+			'post_type'      => $this->crawler_queue_post_type,
+			'posts_per_page' => 1,
+			'orderby'        => 'date',
+			'order'          => 'ASC',
+		);
+
+		$pending = get_posts( array_merge( $oldest_first, array( 'post_status' => 'discovered' ) ) );
+		if ( ! empty( $pending ) ) {
+			return new WP_REST_Response( $pending[0]->guid );
+		}
+
+		// Nothing pending: have we finished crawling or haven't even started yet?
+		$already_crawled = get_posts( array_merge( $oldest_first, array( 'post_status' => 'crawled' ) ) );
+		if ( ! empty( $already_crawled ) ) {
+			return new WP_REST_Response( null, 204 );
+		}
+
+		// we haven't begun, so return domain itself
+		return new WP_REST_Response( $this->domain );
+	}
+
+	/**
+	 * REST callback: marks the source URL as crawled and queues the URLs that
+	 * were discovered on it.
+	 *
+	 * Expects a JSON body of the form `{ "sourceUrl": string, "urls": string[] }`.
+	 * The payload is validated completely before anything is written.
+	 *
+	 * @param mixed $request REST request; its raw body is decoded as JSON.
+	 * @return WP_REST_Response|WP_Error 400 for a malformed payload, 404 when the
+	 *                                   source URL is not known, a WP_Error when a
+	 *                                   post cannot be saved, otherwise an empty
+	 *                                   success response.
+	 */
+	public function queue_urls( $request ): WP_REST_Response|WP_Error {
+		$request_data = json_decode( $request->get_body(), true );
+
+		if (
+			! is_array( $request_data ) ||
+			empty( $request_data['sourceUrl'] ) ||
+			! is_string( $request_data['sourceUrl'] )
+		) {
+			return new WP_REST_Response( null, 400 );
+		}
+
+		// A missing list simply means no links were found on the page.
+		$urls = $request_data['urls'] ?? array();
+		if ( ! is_array( $urls ) ) {
+			return new WP_REST_Response( null, 400 );
+		}
+
+		// Sanitize first so the lookup and the update below use the same value.
+		$source_url = sanitize_url( $request_data['sourceUrl'] );
+		if ( '' === $source_url ) {
+			return new WP_REST_Response( null, 400 );
+		}
+
+		$post_id = $this->get_post_id_by_guid( $source_url );
+		if ( empty( $post_id ) ) {
+			return new WP_REST_Response( null, 404 );
+		}
+
+		$marked = $this->mark_url_as_crawled( $source_url );
+		if ( is_wp_error( $marked ) ) {
+			return $marked;
+		}
+
+		foreach ( $urls as $url ) {
+			// queue_url() only accepts strings; ignore anything else in the list.
+			if ( ! is_string( $url ) || '' === $url ) {
+				continue;
+			}
+
+			$queued_result = $this->queue_url( $url );
+			if ( is_wp_error( $queued_result ) ) {
+				return $queued_result;
+			}
+		}
+
+		return new WP_REST_Response();
+	}
+
+	/**
+	 * Adds a URL to the crawler queue unless it is already present.
+	 *
+	 * The URL is stored as the guid of a new post in `discovered` status, which
+	 * is the status get_next_url() selects from.
+	 *
+	 * @param string $url URL to queue.
+	 * @return true|WP_Error True when the URL is queued, already known, or not a
+	 *                       usable URL; WP_Error when the insert fails.
+	 */
+	private function queue_url( string $url ): true|WP_Error {
+		// Look up and store the same, sanitized value.
+		$guid = sanitize_url( $url );
+
+		// Nothing sensible to queue (e.g. a disallowed protocol).
+		if ( '' === $guid ) {
+			return true;
+		}
+
+		// insert only if it's not present
+		if ( null !== $this->get_post_id_by_guid( $guid ) ) {
+			return true;
+		}
+
+		$inserted_post_id = wp_insert_post(
+			array(
+				'post_type'   => $this->crawler_queue_post_type,
+				// Without an explicit status the post would be a draft and
+				// would never be returned as the next URL to crawl.
+				'post_status' => 'discovered',
+				'guid'        => $guid,
+			),
+			true
+		);
+
+		if ( is_wp_error( $inserted_post_id ) ) {
+			return $inserted_post_id;
+		}
+
+		return true;
+	}
+
+	/**
+	 * Moves the queue post whose guid equals the given URL to `crawled` status.
+	 *
+	 * @param string $url URL (guid) of the queue entry.
+	 * @return true|WP_Error True on success; WP_Error with status 404 when no
+	 *                       crawler queue post matches, or status 500 when the
+	 *                       update fails.
+	 */
+	private function mark_url_as_crawled( $url ): true|WP_Error {
+		$post_id = $this->get_post_id_by_guid( $url );
+		$post    = $post_id ? get_post( $post_id ) : null;
+
+		// Never touch a missing post or a post that belongs to another post type.
+		if ( ! $post || $post->post_type !== $this->crawler_queue_post_type ) {
+			return new WP_Error(
+				'rest_post_invalid_id',
+				__( 'URL is not present in the crawler queue', 'try_wordpress' ),
+				array( 'status' => 404 )
+			);
+		}
+
+		$post->post_status = 'crawled';
+		if ( wp_update_post( $post ) === $post->ID ) {
+			return true;
+		}
+
+		return new WP_Error(
+			'rest_save_failed',
+			__( 'Failed to update url as crawled', 'try_wordpress' ),
+			array( 'status' => 500 )
+		);
+	}
+
+	/**
+	 * Finds the crawler queue post whose guid equals the given URL.
+	 *
+	 * The lookup is limited to the crawler queue post type: posts of the
+	 * liberated data post type also carry source URLs as guid and must not be
+	 * mistaken for queue entries. Results are kept in the object cache.
+	 *
+	 * @param string $guid URL to look up.
+	 * @return int|null Post ID, or null when the URL is not in the queue.
+	 */
+	public function get_post_id_by_guid( string $guid ): ?int {
+		// Use wp_cache_* for guid -> postId
+		$cache_group = 'try_wp';
+		$cache_key   = 'try_wp_crawler_cache_guid_' . md5( $guid );
+		$cached_id   = wp_cache_get( $cache_key, $cache_group );
+
+		if ( false !== $cached_id ) {
+			// Cache hit - make sure the post still exists and is a queue entry
+			$post = get_post( (int) $cached_id );
+			if ( $post && $post->post_type === $this->crawler_queue_post_type ) {
+				return (int) $cached_id;
+			}
+			// Stale entry, drop it and fall through to the database
+			wp_cache_delete( $cache_key, $cache_group );
+		}
+
+		// Cache miss - query database
+		global $wpdb;
+		// phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery
+		$post_id = $wpdb->get_var(
+			$wpdb->prepare(
+				"SELECT ID FROM $wpdb->posts WHERE guid = %s AND post_type = %s LIMIT 1",
+				$guid,
+				$this->crawler_queue_post_type
+			)
+		);
+
+		if ( $post_id ) {
+			$post_id = (int) $post_id;
+			// Cache the post ID for future lookups
+			wp_cache_set( $cache_key, $post_id, $cache_group, YEAR_IN_SECONDS );
+			return $post_id;
+		}
+
+		return null;
+	}
+}
diff --git a/src/plugin/class-engine.php b/src/plugin/class-engine.php
index c0efa7e7..a47d2fa2 100644
--- a/src/plugin/class-engine.php
+++ b/src/plugin/class-engine.php
@@ -5,7 +5,7 @@
class Engine {
public const string LIBERATED_DATA_POST_TYPE = 'liberated_data';
- public const string CRAWLER_DATA_POST_TYPE = 'dl_crawler_url';
+ public const string CRAWLER_QUEUE_POST_TYPE = 'dl_crawler_url';
public function __construct() {
require 'enum-subject-type.php';
@@ -15,6 +15,7 @@ public function __construct() {
require 'class-liberate-controller.php';
require 'class-blogpost-controller.php';
require 'class-page-controller.php';
+ require 'class-crawler-controller.php';
require 'class-controller-registry.php';
require 'class-storage.php';
require 'class-subject.php';
@@ -23,11 +24,11 @@ public function __construct() {
( function () {
$transformer = new Transformer();
- new Post_Type_UI( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_DATA_POST_TYPE, $transformer );
+ new Post_Type_UI( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_QUEUE_POST_TYPE, $transformer );
- new Controller_Registry( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_DATA_POST_TYPE );
+ new Controller_Registry( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_QUEUE_POST_TYPE );
- new Storage( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_DATA_POST_TYPE );
+ new Storage( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_QUEUE_POST_TYPE );
Subject_Repo::init( self::LIBERATED_DATA_POST_TYPE );
} )();
diff --git a/tests/plugin/base-test.php b/tests/plugin/base-test.php
index 97822b79..8743ad33 100644
--- a/tests/plugin/base-test.php
+++ b/tests/plugin/base-test.php
@@ -2,3 +2,13 @@
/**
* Setup for running tests would come here.
*/
+
+// For the crawler controller: seed one draft liberated data post whose guid
+// carries the origin that Controller_Registry::infer_domain() derives.
+wp_insert_post(
+	array(
+		'post_type'   => \DotOrg\TryWordPress\Engine::LIBERATED_DATA_POST_TYPE,
+		// wp_insert_post() reads `post_title`; a bare `title` key is ignored.
+		'post_title'  => 'something to avoid empty filter',
+		'guid'        => 'https://example.org/1',
+		'post_status' => 'draft',
+	)
+);
diff --git a/tests/plugin/test-crawler-controller.php b/tests/plugin/test-crawler-controller.php
new file mode 100644
index 00000000..0e49eb67
--- /dev/null
+++ b/tests/plugin/test-crawler-controller.php
@@ -0,0 +1,64 @@
+endpoint = '/' . $this->namespace . '/crawler';
+
+ // Note: `base-test.php` sets a `liberated_data` post
+
+ $this->crawler_controller = new Crawler_Controller(
+ $this->domain,
+ $this->crawler_queue_post_type
+ );
+ }
+
+ public function testRegisterRoutes(): void { // Both crawler routes must appear in the REST server's route table.
+ // do_action( 'rest_api_init' ); // so that register_route() executes.
+
+ $routes = rest_get_server()->get_routes( $this->namespace );
+ $this->assertArrayHasKey( $this->endpoint . '/next', $routes );
+ $this->assertArrayHasKey( $this->endpoint . '/queue', $routes );
+ }
+
+ /**
+ * Requests GET /crawler/next and expects a 200 response whose data equals $this->domain.
+ *
+ * @group failing
+ */
+ public function testGetNextUrlWithoutQueue(): void {
+ // first fetch should return the domain itself since that's the first url to crawl
+ $request = new WP_REST_Request( 'GET', $this->endpoint . '/next' );
+ $response = rest_do_request( $request );
+
+ $this->assertEquals( 200, $response->get_status() );
+ $this->assertEquals( $this->domain, $response->get_data() );
+ }
+
+ public function testQueueUrls(): void {
+ // NOTE(review): placeholder copied from the /next test — it sends a body-less GET to /queue, while queue_urls() answers 400 when sourceUrl is missing, so the assertions below cannot hold as written.
+ $request = new WP_REST_Request( 'GET', $this->endpoint . '/queue' );
+ $response = rest_do_request( $request );
+
+ $this->assertEquals( 200, $response->get_status() );
+ $this->assertEquals( $this->domain, $response->get_data() );
+ }
+
+ public function testGetNextUrlFromQueue(): void {
+ // NOTE(review): body is identical to testGetNextUrlWithoutQueue — no URL is queued within this test, so the "from queue" path is not exercised yet.
+ $request = new WP_REST_Request( 'GET', $this->endpoint . '/next' );
+ $response = rest_do_request( $request );
+
+ $this->assertEquals( 200, $response->get_status() );
+ $this->assertEquals( $this->domain, $response->get_data() );
+ }
+}