Skip to content

Commit 2be553d

Browse files
authored
feat: Enable custom cron schedule (#2809)
Resolves #2786 Fixes #2796 ## Changes - Enables setting and viewing custom cron schedules. - Custom cron option with support for cron macros @hourly, @daily, @weekly, @monthly, @yearly - Updated documentation for crontab, with links to crontab.guru
1 parent 6c1c06b commit 2be553d

File tree

10 files changed

+361
-72
lines changed

10 files changed

+361
-72
lines changed

chart/app-templates/crawl_cron_job.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,5 +30,5 @@ spec:
3030
restartPolicy: Never
3131
containers:
3232
- name: noop
33-
image: "docker.io/tianon/true"
33+
image: "docker.io/tianon/true:multiarch"
3434
imagePullPolicy: IfNotPresent

frontend/docs/docs/user-guide/workflow-setup.md

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -318,9 +318,6 @@ Automatically start crawls periodically on a daily, weekly, or monthly schedule.
318318

319319
### Crawl Schedule Type
320320

321-
#### Run Immediately on Save
322-
: When selected, the crawl will run immediately as configured. It will not run again unless manually instructed.
323-
324321
#### Run on a Recurring Basis
325322
: When selected, additional configuration options for instructing the system when to run the crawl will be shown. If a crawl is already running when the schedule is set to activate it, the scheduled crawl will not run.
326323

@@ -331,6 +328,26 @@ Automatically start crawls periodically on a daily, weekly, or monthly schedule.
331328

332329
Set how often a scheduled crawl will run.
333330

331+
#### Options
332+
333+
All options support specifying the hour and minute at which the crawl should run.
334+
335+
##### Daily
336+
337+
Run crawl once every day.
338+
339+
##### Weekly
340+
341+
Run crawl once every week.
342+
343+
##### Monthly
344+
345+
Run crawl once every month.
346+
347+
##### Custom
348+
349+
Run crawl at a custom interval, such as hourly or yearly. See [Cron Schedule](#cron-schedule) for details.
350+
334351
### Day
335352

336353
Sets the day of the week for which crawls scheduled with a `Weekly` _Frequency_ will run.
@@ -343,6 +360,33 @@ Sets the date of the month for which crawls scheduled with a `Monthly` _Frequenc
343360

344361
Sets the time that the scheduled crawl will start according to your current timezone.
345362

363+
### Cron Schedule
364+
365+
When using a `Custom` _Frequency_, a custom schedule can be specified by using a Cron expression or supported macros.
366+
367+
Cron expressions should follow the Unix Cron format:
368+
369+
| Position | * | * | * | * | * |
370+
| - | - | - | - | - | - |
371+
| **Description** | minute | hour | day of the month | month | day of the week |
372+
| **Possible Values** | 0 - 59 | 0 - 23 | 1 - 31 | 1 - 12 | 0 - 6<br/>or `sun`, `mon`, `tue`, `wed`, `thu`, `fri`, `sat` |
373+
374+
For example, `0 0 31 12 *` would run a crawl at midnight on December 31st every year and `0 0 * * fri` would run a crawl every Friday at midnight.
375+
376+
Additionally, the following macros are supported:
377+
378+
| Value | Description |
379+
| - | - |
380+
| `@yearly` | Run once a year at midnight of 1 January |
381+
| `@monthly` | Run once a month at midnight of the first day of the month |
382+
| `@weekly` | Run once a week at midnight on Sunday |
383+
| `@daily` | Run once a day at midnight |
384+
| `@hourly` | Run once an hour at the beginning of the hour |
385+
386+
You can use a tool like [crontab.guru](https://crontab.guru/) to check Cron syntax validity and view [common expressions](https://crontab.guru/examples.html).
387+
388+
Cron schedules are always in [UTC](https://en.wikipedia.org/wiki/Coordinated_Universal_Time).
389+
346390
## Metadata
347391

348392
Describe and organize your crawl workflow and the resulting archived items.

frontend/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
"color": "^4.0.1",
3636
"construct-style-sheets-polyfill": "^3.1.0",
3737
"copy-webpack-plugin": "^12.0.2",
38+
"cronstrue": "^3.2.0",
3839
"css-loader": "^6.3.0",
3940
"css-selector-parser": "^3.0.5",
4041
"date-fns": "^3.6.0",

frontend/src/features/crawl-workflows/workflow-editor.ts

Lines changed: 115 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { consume } from "@lit/context";
22
import { localized, msg, str } from "@lit/localize";
33
import type {
44
SlBlurEvent,
5+
SlChangeEvent,
56
SlCheckbox,
67
SlDetails,
78
SlHideEvent,
@@ -105,6 +106,7 @@ import {
105106
getUTCSchedule,
106107
humanizeNextDate,
107108
humanizeSchedule,
109+
validateCron,
108110
} from "@/utils/cron";
109111
import { makeCurrentTargetHandler, stopProp } from "@/utils/events";
110112
import { formValidator, maxLengthValidator } from "@/utils/form";
@@ -267,7 +269,12 @@ export class WorkflowEditor extends BtrixElement {
267269
@property({ type: String })
268270
initialScopeType?: FormState["scopeType"];
269271

270-
@property({ type: Object })
272+
@property({
273+
type: Object,
274+
// Fixes values being reset on navigation events,
275+
// such as when the user guide is open
276+
hasChanged: (a, b) => !isEqual(a, b),
277+
})
271278
initialWorkflow?: WorkflowParams;
272279

273280
private updatingScopeType = false;
@@ -344,7 +351,7 @@ export class WorkflowEditor extends BtrixElement {
344351

345352
private get utcSchedule() {
346353
if (!this.formState.scheduleFrequency) {
347-
return "";
354+
return this.formState.scheduleCustom;
348355
}
349356
return getUTCSchedule({
350357
interval: this.formState.scheduleFrequency,
@@ -2047,19 +2054,48 @@ https://archiveweb.page/images/${"logo.svg"}`}
20472054
}
20482055

20492056
private readonly renderScheduleCron = () => {
2050-
const utcSchedule = this.utcSchedule;
2057+
const scheduledDate = (schedule?: string, opts?: { utc: boolean }) => {
2058+
if (!schedule) return nothing;
2059+
2060+
const humanized = humanizeSchedule(schedule);
2061+
2062+
if (!humanized) return nothing;
2063+
2064+
return html`
2065+
<div class="mt-3 text-xs text-neutral-500">
2066+
<p class="mb-1">
2067+
${msg("Schedule:")}
2068+
<span class="text-blue-500">${humanized}</span>
2069+
</p>
2070+
<p>
2071+
${msg("Next scheduled run:")}
2072+
<span>${humanizeNextDate(schedule, opts)}</span>
2073+
</p>
2074+
</div>
2075+
`;
2076+
};
2077+
2078+
const hourly_macro_code = html`<code>@hourly</code>`;
2079+
const yearly_macro_code = html`<code>@yearly</code>`;
2080+
20512081
return html`
20522082
${this.renderSectionHeading(msg("Set Schedule"))}
20532083
${inputCol(html`
20542084
<sl-select
20552085
name="scheduleFrequency"
20562086
label=${msg("Frequency")}
20572087
value=${this.formState.scheduleFrequency}
2058-
@sl-change=${(e: Event) =>
2088+
@sl-change=${(e: Event) => {
2089+
const scheduleFrequency = (e.target as HTMLSelectElement)
2090+
.value as FormState["scheduleFrequency"];
2091+
20592092
this.updateFormState({
2060-
scheduleFrequency: (e.target as HTMLSelectElement)
2061-
.value as FormState["scheduleFrequency"],
2062-
})}
2093+
scheduleFrequency,
2094+
scheduleCustom: scheduleFrequency
2095+
? ""
2096+
: this.formState.scheduleCustom || "",
2097+
});
2098+
}}
20632099
>
20642100
<sl-option value="daily"
20652101
>${this.scheduleFrequencyLabels["daily"]}</sl-option
@@ -2070,6 +2106,8 @@ https://archiveweb.page/images/${"logo.svg"}`}
20702106
<sl-option value="monthly"
20712107
>${this.scheduleFrequencyLabels["monthly"]}</sl-option
20722108
>
2109+
<sl-divider></sl-divider>
2110+
<sl-option value="">${msg("Custom")}</sl-option>
20732111
</sl-select>
20742112
`)}
20752113
${this.renderHelpTextCol(
@@ -2122,44 +2160,72 @@ https://archiveweb.page/images/${"logo.svg"}`}
21222160
)}
21232161
`,
21242162
)}
2125-
${inputCol(html`
2126-
<btrix-time-input
2127-
hour=${ifDefined(this.formState.scheduleTime?.hour)}
2128-
minute=${ifDefined(this.formState.scheduleTime?.minute)}
2129-
period=${ifDefined(this.formState.scheduleTime?.period)}
2130-
@time-change=${(e: TimeInputChangeEvent) => {
2131-
this.updateFormState({
2132-
scheduleTime: e.detail,
2133-
});
2134-
}}
2135-
>
2136-
<span slot="label">${msg("Start Time")}</span>
2137-
</btrix-time-input>
2138-
<div class="mt-3 text-xs text-neutral-500">
2139-
<p class="mb-1">
2140-
${msg(
2141-
html`Schedule:
2142-
<span class="text-blue-500"
2143-
>${utcSchedule
2144-
? humanizeSchedule(utcSchedule)
2145-
: msg("Invalid date")}</span
2146-
>.`,
2147-
)}
2148-
</p>
2149-
<p>
2163+
${when(
2164+
this.formState.scheduleFrequency,
2165+
() =>
2166+
html`${inputCol(html`
2167+
<btrix-time-input
2168+
hour=${ifDefined(this.formState.scheduleTime?.hour)}
2169+
minute=${ifDefined(this.formState.scheduleTime?.minute)}
2170+
period=${ifDefined(this.formState.scheduleTime?.period)}
2171+
@time-change=${(e: TimeInputChangeEvent) => {
2172+
this.updateFormState({
2173+
scheduleTime: e.detail,
2174+
});
2175+
}}
2176+
>
2177+
<span slot="label">${msg("Start Time")}</span>
2178+
</btrix-time-input>
2179+
${scheduledDate(this.utcSchedule)}
2180+
`)}
2181+
${this.renderHelpTextCol(
2182+
msg(`A crawl will run at this time in your current timezone.`),
2183+
)}`,
2184+
() => html`
2185+
${inputCol(html`
2186+
<sl-input
2187+
name="scheduleCustom"
2188+
label=${msg("Cron Schedule")}
2189+
class="part-[input]:font-mono"
2190+
placeholder="@hourly"
2191+
value=${ifDefined(this.formState.scheduleCustom)}
2192+
minlength="6"
2193+
@sl-change=${(e: SlChangeEvent) => {
2194+
const input = e.target as SlInput;
2195+
const value = (e.target as SlInput).value;
2196+
2197+
if (!value) return;
2198+
2199+
const { valid, error } = validateCron(value);
2200+
2201+
if (valid) {
2202+
input.helpText = "";
2203+
input.setCustomValidity("");
2204+
} else {
2205+
const errorMessage =
2206+
error ?? msg("Please fix invalid Cron expression syntax.");
2207+
2208+
input.helpText = errorMessage;
2209+
input.setCustomValidity(errorMessage);
2210+
}
2211+
}}
2212+
required
2213+
>
2214+
</sl-input>
2215+
${scheduledDate(this.formState.scheduleCustom)}
2216+
`)}
2217+
${this.renderHelpTextCol(html`
2218+
${msg("Specify a schedule in Cron format.")}
21502219
${msg(
2151-
html`Next scheduled run:
2152-
<span
2153-
>${utcSchedule
2154-
? humanizeNextDate(utcSchedule)
2155-
: msg("Invalid date")}</span
2156-
>.`,
2220+
html`Supports Unix cron syntax and certain macros like
2221+
${hourly_macro_code} and ${yearly_macro_code}.`,
21572222
)}
2158-
</p>
2159-
</div>
2160-
`)}
2161-
${this.renderHelpTextCol(
2162-
msg(`A crawl will run at this time in your current timezone.`),
2223+
${this.renderUserGuideLink({
2224+
hash: "cron-schedule",
2225+
content: msg("More details"),
2226+
})}
2227+
`)}
2228+
`,
21632229
)}
21642230
`;
21652231
};
@@ -3017,7 +3083,12 @@ https://archiveweb.page/images/${"logo.svg"}`}
30173083
description: this.formState.description,
30183084
browserWindows: this.formState.browserWindows,
30193085
profileid: this.formState.browserProfile?.id || "",
3020-
schedule: this.formState.scheduleType === "cron" ? this.utcSchedule : "",
3086+
schedule:
3087+
this.formState.scheduleType === "none"
3088+
? ""
3089+
: (this.formState.scheduleFrequency
3090+
? this.utcSchedule
3091+
: this.formState.scheduleCustom) || "",
30213092
crawlTimeout: this.formState.crawlTimeoutMinutes * 60,
30223093
maxCrawlSize: this.formState.maxCrawlSizeGB * BYTES_PER_GB,
30233094
tags: this.formState.tags,

frontend/src/utils/cron.test.ts

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import { expect } from "@open-wc/testing";
2+
3+
import { getScheduleInterval, humanizeSchedule } from "./cron";
4+
5+
describe("cron utils", () => {
6+
describe("getScheduleInterval()", () => {
7+
it("handles daily intervals", () => {
8+
expect(getScheduleInterval("1 1 * * *")).to.equal("daily");
9+
});
10+
11+
it("handles weekly intervals", () => {
12+
expect(getScheduleInterval("1 1 * * 1")).to.equal("weekly");
13+
expect(getScheduleInterval("1 1 * * FRI")).to.equal("weekly");
14+
});
15+
16+
it("handles monthly intervals", () => {
17+
expect(getScheduleInterval("1 1 1 * *")).to.equal("monthly");
18+
});
19+
20+
it("returns null if not daily, weekly, or monthly", () => {
21+
// Every minute:
22+
expect(getScheduleInterval("* * * * *")).to.equal(null);
23+
expect(getScheduleInterval("* 1 * * *")).to.equal(null);
24+
expect(getScheduleInterval("* * 1 * *")).to.equal(null);
25+
expect(getScheduleInterval("* * * 1 *")).to.equal(null);
26+
expect(getScheduleInterval("* * * * 1")).to.equal(null);
27+
expect(getScheduleInterval("*/5 * * * *")).to.equal(null);
28+
// Hourly:
29+
expect(getScheduleInterval("1 * * * *")).to.equal(null);
30+
expect(getScheduleInterval("0 */5 * * *")).to.equal(null);
31+
// Yearly:
32+
expect(getScheduleInterval("1 1 1 JAN *")).to.equal(null);
33+
expect(getScheduleInterval("1 1 1 1 *")).to.equal(null);
34+
expect(getScheduleInterval("1 1 1 1 1")).to.equal(null);
35+
});
36+
37+
it("returns null for macros", () => {
38+
expect(getScheduleInterval("@yearly")).to.equal(null);
39+
expect(getScheduleInterval("@monthly")).to.equal(null);
40+
expect(getScheduleInterval("@weekly")).to.equal(null);
41+
expect(getScheduleInterval("@daily")).to.equal(null);
42+
expect(getScheduleInterval("@hourly")).to.equal(null);
43+
});
44+
});
45+
46+
describe("humanizeSchedule()", () => {
47+
it("humanizes daily schedule", () => {
48+
expect(humanizeSchedule("30 1 * * *")).to.equal(
49+
"Every day at 7:30 PM GMT-6",
50+
);
51+
});
52+
53+
it("humanizes weekly schedule", () => {
54+
expect(humanizeSchedule("30 1 * * 1")).to.equal(
55+
"Every Sunday at 8:30 PM GMT-5",
56+
);
57+
});
58+
59+
it("humanizes monthly schedule", () => {
60+
expect(humanizeSchedule("30 1 1 * *")).to.equal(
61+
"On day 30 of the month at 8:30 PM GMT-5",
62+
);
63+
});
64+
65+
it("humanizes schedule without a known interval", () => {
66+
expect(humanizeSchedule("* * * * *")).to.equal("Every minute (UTC)");
67+
expect(humanizeSchedule("30 * 1 * *")).to.equal(
68+
"At 30 minutes past the hour, on day 1 of the month (UTC)",
69+
);
70+
});
71+
});
72+
});

0 commit comments

Comments
 (0)