Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,70 @@ const CONFIDENCE_MEDIUM = 0.95;
const CONFIDENCE_ABSOLUTE = 1.0;
const DEFAULT_TIMEOUT = 5000;

function analyzeResponse(response) {
/**
* SpaceCat bot identification constants
*/
export const SPACECAT_BOT_USER_AGENT = 'Spacecat/1.0';

/**
* SpaceCat bot IPs by environment
*/
export const SPACECAT_BOT_IPS = {
production: [
'3.218.16.42',
'52.55.82.37',
'54.172.145.38',
],
development: [
'44.218.57.115',
'54.87.205.187',
],
};

/**
* HTML patterns for detecting challenge pages
*/
const CHALLENGE_PATTERNS = {
cloudflare: [
/Checking your browser/i,
/Just a moment\.\.\./i,
/Verifying you are human/i,
/Please wait.*CloudFlare/i,
/cf-turnstile/i,
/challenge-platform/i,
/cf-chl-widget/i, // Cloudflare challenge widget
/ray\s*id.*cloudflare/i, // Cloudflare Ray ID in error pages
/__cf_chl_tk/i, // Cloudflare challenge token
/cloudflare.*security/i,
/attention required.*cloudflare/i,
],
imperva: [
/_Incapsula_Resource/i,
/Incapsula incident ID/i,
/incap_ses/i, // Imperva session cookie
/visid_incap/i, // Imperva visitor ID
],
akamai: [
/Access Denied.*Akamai/i,
/Reference.*Akamai/i,
],
general: [
/captcha/i,
/human verification/i,
/recaptcha/i,
/hcaptcha/i,
/datadome/i,
/dd-request-id/i,
],
};

/**
* Analyzes response for bot protection indicators
* @param {Object} response - Response object with status and headers
* @param {string} [html] - Optional HTML content for deeper analysis
* @returns {Object} Detection result
*/
function analyzeResponse(response, html = null) {
const { status, headers } = response;

// Check for CDN/blocker infrastructure presence (lazy evaluation for performance)
Expand All @@ -45,6 +108,12 @@ function analyzeResponse(response) {
|| headers.get('x-amz-cf-pop')
|| headers.get('via')?.includes('CloudFront');

// Check HTML content for challenge page patterns (if HTML provided)
const htmlHasChallenge = (patterns) => {
if (!html) return false;
return patterns.some((pattern) => pattern.test(html));
};

// Active blocking (403 status with known blocker)
if (status === 403 && hasCloudflare()) {
return {
Expand Down Expand Up @@ -88,6 +157,16 @@ function analyzeResponse(response) {

// Success with known infrastructure present (infrastructure detected but allowing requests)
if (status === 200 && hasCloudflare()) {
// Check if HTML contains challenge page (even though status is 200)
if (htmlHasChallenge(CHALLENGE_PATTERNS.cloudflare)) {
return {
crawlable: false,
type: 'cloudflare',
confidence: CONFIDENCE_HIGH,
reason: 'Challenge page detected despite 200 status',
};
}

return {
crawlable: true,
type: 'cloudflare-allowed',
Expand All @@ -96,6 +175,14 @@ function analyzeResponse(response) {
}

if (status === 200 && hasImperva()) {
if (htmlHasChallenge(CHALLENGE_PATTERNS.imperva)) {
return {
crawlable: false,
type: 'imperva',
confidence: CONFIDENCE_HIGH,
reason: 'Challenge page detected despite 200 status',
};
}
return {
crawlable: true,
type: 'imperva-allowed',
Expand All @@ -104,6 +191,14 @@ function analyzeResponse(response) {
}

if (status === 200 && hasAkamai()) {
if (htmlHasChallenge(CHALLENGE_PATTERNS.akamai)) {
return {
crawlable: false,
type: 'akamai',
confidence: CONFIDENCE_HIGH,
reason: 'Challenge page detected despite 200 status',
};
}
return {
crawlable: true,
type: 'akamai-allowed',
Expand All @@ -129,6 +224,15 @@ function analyzeResponse(response) {

// Success with no known infrastructure
if (status === 200) {
// Still check for generic challenge patterns
if (htmlHasChallenge(CHALLENGE_PATTERNS.general)) {
return {
crawlable: false,
type: 'unknown',
confidence: 0.7,
reason: 'Generic challenge patterns detected',
};
}
return {
crawlable: true,
type: 'none',
Expand Down Expand Up @@ -207,3 +311,27 @@ export async function detectBotBlocker({ baseUrl, timeout = DEFAULT_TIMEOUT }) {
return analyzeError(error);
}
}

/**
* Analyzes already-fetched response data for bot protection.
* Used by content scraper to analyze Puppeteer results without making another request.
*
* @param {Object} data - Response data to analyze
* @param {number} data.status - HTTP status code
* @param {Object} data.headers - Response headers (plain object or Headers object)
* @param {string} [data.html] - Optional HTML content for challenge page detection
* @returns {Object} Detection result (same format as detectBotBlocker)
*/
export function analyzeBotProtection({ status, headers, html }) {
// Convert headers to Headers object if plain object
const headersObj = headers instanceof Headers
? headers
: new Headers(Object.entries(headers || {}));

const mockResponse = {
status,
headers: headersObj,
};

return analyzeResponse(mockResponse, html);
}
7 changes: 6 additions & 1 deletion packages/spacecat-shared-utils/src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,12 @@ export * as llmoConfig from './llmo-config.js';
export * as schemas from './schemas.js';

export { detectLocale } from './locale-detect/locale-detect.js';
export { detectBotBlocker } from './bot-blocker-detect/bot-blocker-detect.js';
export {
detectBotBlocker,
analyzeBotProtection,
SPACECAT_BOT_USER_AGENT,
SPACECAT_BOT_IPS,
} from './bot-blocker-detect/bot-blocker-detect.js';
export { prettifyLogForwardingConfig } from './cdn-helpers.js';

export {
Expand Down
Loading