Skip to content

Commit 0a654a9

Browse files
authored
feat(datafeed): automatically filter data feed articles using Generative AI (#99)
1 parent 3251bc6 commit 0a654a9

File tree

15 files changed

+364
-26
lines changed

15 files changed

+364
-26
lines changed

lib/api/functions/pipeline/updateDataFeed/index.ts

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,26 +6,17 @@ import {
66
import * as ddb from '@aws-appsync/utils/dynamodb'
77

88
export function request (ctx: Context): DynamoDBUpdateItemRequest {
9-
const { dataFeedId } = ctx.args.input
109
const values: Record<string, unknown> = {}
11-
Object.keys(ctx.args.input as Record<string, unknown>).forEach(
12-
(key: string) => {
13-
if (
14-
ctx.args?.input[key] !== undefined &&
15-
ctx.args?.input[key] !== null &&
16-
key !== 'dataFeedId'
17-
) {
18-
console.log(
19-
`UpdateDataFeed. Loop values: ${key} ---- ${ctx.args.input[key]}`
20-
)
21-
values[key] = ctx.args.input[key]
22-
}
10+
for (const [key, value] of Object.entries(
11+
ctx.args.input as Record<string, unknown>
12+
)) {
13+
if (key !== 'id' && value !== undefined && value !== null) {
14+
values[key] = value
2315
}
24-
)
25-
16+
}
2617
return ddb.update({
2718
key: {
28-
dataFeedId,
19+
dataFeedId: ctx.args.input.id,
2920
sk: 'dataFeed'
3021
},
3122
update: { ...values }
Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
/*
2+
*
3+
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
* SPDX-License-Identifier: MIT-0
5+
*/
6+
import { Tracer } from '@aws-lambda-powertools/tracer'
7+
import { captureLambdaHandler } from '@aws-lambda-powertools/tracer/middleware'
8+
import { Logger } from '@aws-lambda-powertools/logger'
9+
import { injectLambdaContext } from '@aws-lambda-powertools/logger/middleware'
10+
import middy from '@middy/core'
11+
import { type FeedArticle } from '../../shared/common'
12+
import {
13+
DynamoDBClient,
14+
GetItemCommand,
15+
GetItemCommandInput
16+
} from '@aws-sdk/client-dynamodb'
17+
import axios from 'axios'
18+
import * as cheerio from 'cheerio'
19+
import {
20+
BedrockRuntimeClient,
21+
InvokeModelCommand,
22+
InvokeModelCommandInput
23+
} from '@aws-sdk/client-bedrock-runtime'
24+
25+
// Service name shared by the Powertools tracer and logger below.
const SERVICE_NAME = 'filter-articles-with-bedrock'

const tracer = new Tracer({ serviceName: SERVICE_NAME })
const logger = new Logger({ serviceName: SERVICE_NAME })

// AWS SDK v3 clients, passed through the tracer's captureAWSv3Client wrapper.
const dynamodb = tracer.captureAWSv3Client(new DynamoDBClient())
const bedrockRuntimeClient = tracer.captureAWSv3Client(
  new BedrockRuntimeClient()
)

// Name of the data feed table, read from the environment.
const DATA_FEED_TABLE = process.env.DATA_FEED_TABLE
// Identifier of the Bedrock model invoked to classify articles.
const BEDROCK_MODEL_ID = 'anthropic.claude-3-haiku-20240307-v1:0'
37+
38+
/** Event payload accepted by the filter-articles-with-bedrock handler. */
interface FilterArticlesWithBedrockInput {
  /** Identifier of the data feed whose filter prompt is looked up. */
  dataFeedId: string
  /** Candidate articles to run through the filter. */
  articles: FeedArticle[]
}
42+
43+
/**
 * Lambda entry point: logs the incoming article count, delegates to
 * filterArticlesWithBedrock, logs the resulting count and returns the
 * articles that remain.
 *
 * @param event - data feed id plus the articles to evaluate
 * @returns the articles returned by filterArticlesWithBedrock
 */
const lambdaHandler = async (
  event: FilterArticlesWithBedrockInput
): Promise<FeedArticle[]> => {
  logger.debug(
    'Filtering articles with Bedrock for Data Feed ID ',
    event.dataFeedId
  )
  logger.debug('Unfiltered new article count = ', {
    articleLength: event.articles.length
  })
  const keptArticles = await filterArticlesWithBedrock(
    event.articles,
    event.dataFeedId
  )
  logger.debug('Filtered article count = ' + keptArticles.length)
  return keptArticles
}
55+
56+
/**
 * Runs every article through the data feed's filter prompt and returns the
 * ones that are not filtered out.
 *
 * When the data feed has no filter prompt (or only a blank one) filtering is
 * disabled and the input list is returned unchanged, without crawling any
 * article or calling Bedrock.
 *
 * Articles are processed sequentially. An error thrown while fetching or
 * classifying an article propagates to the caller.
 *
 * @param articles - candidate articles
 * @param dataFeedId - data feed whose filter prompt is applied
 * @returns the articles that passed the filter
 */
const filterArticlesWithBedrock = async (
  articles: FeedArticle[],
  dataFeedId: string
): Promise<FeedArticle[]> => {
  const filterPrompt = await getFilterPrompt(dataFeedId)
  // A blank prompt carries no filtering instruction, so treat it like a
  // missing one instead of asking the model to classify against nothing.
  if (filterPrompt === null || filterPrompt.trim() === '') {
    return articles
  }

  const filteredArticles: FeedArticle[] = []
  for (const article of articles) {
    logger.debug('Working on article', { article })
    const siteContent = await getSiteContent(article.url)
    if (siteContent === null) {
      // Without content there is nothing to classify; the article is dropped,
      // but leave a trace so the omission can be diagnosed.
      logger.warn('No site content available; article skipped', {
        url: article.url
      })
      continue
    }
    const isFiltered = await isArticleFilteredWithBedrock(
      siteContent,
      filterPrompt
    )
    if (isFiltered) {
      logger.debug('Article filtered out: ' + article.title)
    } else {
      logger.debug('Article passed filter: ' + article.title)
      filteredArticles.push(article)
    }
  }
  return filteredArticles
}
83+
84+
/**
 * Reads the article filter prompt stored on the data feed item
 * (key: dataFeedId + sk 'dataFeed') in the data feed table.
 *
 * @param dataFeedId - identifier of the data feed
 * @returns the stored `articleFilterPrompt` string, or null when the item or
 *   the attribute does not exist
 */
const getFilterPrompt = async (dataFeedId: string): Promise<string | null> => {
  logger.debug('Getting filter prompt for data feed ', dataFeedId)
  const input: GetItemCommandInput = {
    TableName: DATA_FEED_TABLE,
    Key: {
      dataFeedId: { S: dataFeedId },
      sk: { S: 'dataFeed' }
    },
    // ProjectionExpression replaces the legacy AttributesToGet parameter.
    ProjectionExpression: 'articleFilterPrompt'
  }
  const result = await dynamodb.send(new GetItemCommand(input))
  const filterPrompt = result.Item?.articleFilterPrompt?.S
  if (filterPrompt === undefined) {
    logger.debug('No filter prompt found for data feed ', dataFeedId)
    return null
  }
  // Structured fields keep the feed id and the prompt text distinguishable
  // in the log entry.
  logger.debug('Filter prompt found for data feed', {
    dataFeedId,
    filterPrompt
  })
  return filterPrompt
}
115+
116+
/**
 * Asks the Bedrock model whether an article should be filtered out according
 * to the user supplied filter prompt.
 *
 * The model is instructed to answer inside a <filter_response> tag; the tag
 * content is interpreted by extractResponseValue.
 *
 * @param articleContent - text of the article
 * @param filterPrompt - filter instruction configured on the data feed
 * @returns true when the model says the article is filtered out
 * @throws Error when the model response has no text at content[0].text;
 *   errors from the Bedrock client and from JSON parsing propagate
 */
const isArticleFilteredWithBedrock = async (
  articleContent: string,
  filterPrompt: string
): Promise<boolean> => {
  // Defensive guard kept from the original for untyped callers.
  if (filterPrompt === null) {
    return false
  }
  // Join with newlines so consecutive instructions do not run together
  // (previously "...filter prompt.Is the article...").
  const prompt = [
    'You are an agent responsible for reading articles and determining if the article should be filtered out based on the filter prompt.',
    "Is the article filtered out based on the filter prompt? Return either 'true' or 'false'.",
    "If the article is filtered out, return 'true', otherwise return 'false'.",
    'Here is the article content:',
    '<article>' + articleContent + '</article>',
    'Here is the filter prompt:',
    '<filter_prompt>' + filterPrompt + '</filter_prompt>',
    "Only return 'true' if the article is filtered out based on the filter prompt. Do not return any other content.",
    'Place the response in a <filter_response> xml tag.'
  ].join('\n')

  const input: InvokeModelCommandInput = {
    modelId: BEDROCK_MODEL_ID,
    contentType: 'application/json',
    accept: '*/*',
    body: new TextEncoder().encode(
      JSON.stringify({
        max_tokens: 1000,
        anthropic_version: 'bedrock-2023-05-31',
        messages: [
          {
            role: 'user',
            content: [
              {
                type: 'text',
                text: prompt
              }
            ]
          }
        ]
      })
    )
  }
  const response = await bedrockRuntimeClient.send(
    new InvokeModelCommand(input)
  )
  const responseText = new TextDecoder().decode(response.body)
  logger.debug('Response from Bedrock: ' + responseText)

  // JSON.parse output is untyped; check the one field we rely on before use.
  const responseObject: unknown = JSON.parse(responseText)
  const completion = (
    responseObject as { content?: Array<{ text?: unknown }> } | null
  )?.content?.[0]?.text
  if (typeof completion !== 'string') {
    throw new Error(
      'Unexpected Bedrock response shape: missing content[0].text'
    )
  }
  return extractResponseValue(completion, 'filter_response')
}
167+
168+
/**
 * Downloads a page and extracts its readable text.
 *
 * footer, header, script, style and nav elements are removed first. The text
 * of the <article> element(s) is used when present, otherwise the text of
 * <body>.
 *
 * @param url - address of the article page
 * @returns the extracted text, or null when the page yields no text
 * @throws rethrows any error raised while fetching or parsing the page, after
 *   logging it and recording it on the tracer
 */
const getSiteContent = async (url: string): Promise<string | null> => {
  logger.debug(`getSiteContent Called; url = ${url}`)
  tracer.putMetadata('url', url)
  let $: cheerio.Root
  try {
    logger.debug('URL of Provided Site = ' + url)
    const response = await axios.get(url)
    tracer.putAnnotation('url', 'Successfully Crawled')
    $ = cheerio.load(response.data as string)
    // Cutting out elements that aren't needed
    $('footer').remove()
    $('header').remove()
    $('script').remove()
    $('style').remove()
    $('nav').remove()
  } catch (error) {
    // Hand the error object to the logger: JSON.stringify on a plain Error
    // produces "{}" and loses the message and stack.
    logger.error(`Failed to crawl; url = ${url}`, error as Error)
    tracer.addErrorAsMetadata(error as Error)
    throw error
  }

  const container = $('article').length > 0 ? $('article') : $('body')
  const articleText = container.text()
  // .text() always returns a string, so "no content" has to be detected by
  // emptiness; an undefined check can never fire.
  if (articleText.trim() === '') {
    logger.warn(`No text content extracted; url = ${url}`)
    return null
  }
  return articleText
}
202+
203+
/**
 * Reads a boolean answer out of an XML-style tag in a model response.
 *
 * Line breaks (real ones and literal "\n" sequences) are stripped, then the
 * content of the first `<xml_tag>…</xml_tag>` pair is compared to "true",
 * ignoring case and surrounding whitespace.
 *
 * @param response - raw text returned by the model
 * @param xml_tag - tag name, without angle brackets
 * @returns true only when the first tag's content is "true"; false when the
 *   tag is missing or holds anything else
 */
const extractResponseValue = (response: string, xml_tag: string): boolean => {
  const formattedInput = response
    .replace(/(\r\n|\n|\r)/gm, '')
    .replace(/\\n/g, '')
  // Escape the tag so characters such as "." are matched literally.
  const escapedTag = xml_tag.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  const tagPattern = new RegExp(`<${escapedTag}>(.*?)</${escapedTag}>`)
  const match = tagPattern.exec(formattedInput)
  // Trim so answers like "<tag> true </tag>" are still recognised.
  return match?.[1]?.trim().toLowerCase() === 'true'
}
214+
215+
/**
 * Exported Lambda handler: lambdaHandler wrapped with the tracer's
 * captureLambdaHandler middleware (captureResponse set to false) followed by
 * the logger's injectLambdaContext middleware.
 */
export const handler = middy()
  .use(captureLambdaHandler(tracer, { captureResponse: false }))
  .use(injectLambdaContext(logger))
  // Middleware is registered in the same order as before; only the point at
  // which the base handler is attached has moved.
  .handler(lambdaHandler)

lib/data-feed-ingestion/rss-atom-ingestion/ingestion-step-function.ts

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,40 @@ export class IngestionStepFunction extends Construct {
127127
})
128128
)
129129

130+
const filterArticlesWithBedrockFunction = new NodejsFunction(
131+
this,
132+
'filter-articles-with-bedrock',
133+
{
134+
description:
135+
'Function responsible for filtering out using a user provided prompt and Amazon Bedrock.',
136+
handler: 'handler',
137+
entry: new URL(
138+
import.meta.url.replace(
139+
/(.*)(\..+)/,
140+
'$1.' + 'filter-articles-with-bedrock' + '$2'
141+
)
142+
).pathname,
143+
runtime: Runtime.NODEJS_20_X,
144+
architecture: Architecture.ARM_64,
145+
tracing: Tracing.ACTIVE,
146+
loggingFormat: LoggingFormat.JSON,
147+
applicationLogLevelV2: ApplicationLogLevel.DEBUG,
148+
insightsVersion: LambdaInsightsVersion.VERSION_1_0_229_0,
149+
timeout: cdk.Duration.minutes(5),
150+
environment: {
151+
DATA_FEED_TABLE: dataFeedTable.tableName
152+
}
153+
}
154+
)
155+
dataFeedTable.grantReadData(filterArticlesWithBedrockFunction)
156+
filterArticlesWithBedrockFunction.addToRolePolicy(
157+
new PolicyStatement({
158+
actions: ['bedrock:InvokeModel'],
159+
resources: ['*'],
160+
effect: Effect.ALLOW
161+
})
162+
)
163+
130164
const getDataFeedDetailsJob = new DynamoGetItem(
131165
this,
132166
'GetDataFeedDetailsJob',
@@ -182,6 +216,23 @@ export class IngestionStepFunction extends Construct {
182216
payload: TaskInput.fromJsonPathAt('$')
183217
})
184218

219+
const filterArticlesWithBedrockJob = new LambdaInvoke(
220+
this,
221+
'FilterArticlesWithBedrock',
222+
{
223+
lambdaFunction: filterArticlesWithBedrockFunction,
224+
inputPath: JsonPath.stringAt('$'),
225+
payload: TaskInput.fromObject({
226+
dataFeedId: JsonPath.stringAt('$.dataFeedId'),
227+
articles: JsonPath.objectAt('$.articlesData.articles')
228+
}),
229+
resultSelector: {
230+
'articles.$': '$.Payload'
231+
},
232+
resultPath: '$.articlesData'
233+
}
234+
)
235+
185236
const mapArticles = new Map(this, 'MapArticles', {
186237
itemsPath: '$.articlesData.articles',
187238
itemSelector: {
@@ -197,6 +248,7 @@ export class IngestionStepFunction extends Construct {
197248
const definition = getDataFeedDetailsJob
198249
.next(readFeedJob)
199250
.next(filterIngestedArticlesJob)
251+
.next(filterArticlesWithBedrockJob)
200252
.next(mapArticles)
201253

202254
const stateMachine = new StateMachine(this, 'IngestionStateMachine', {
@@ -217,6 +269,7 @@ export class IngestionStepFunction extends Construct {
217269
feedReaderFunction.grantInvoke(stateMachine)
218270
filterIngestedArticlesFunction.grantInvoke(stateMachine)
219271
articleIngestionFunction.grantInvoke(stateMachine)
272+
filterArticlesWithBedrockFunction.grantInvoke(stateMachine)
220273
props.dataFeedTable.grantWriteData(articleIngestionFunction)
221274
props.rssAtomDataBucket.grantPut(stateMachine)
222275
this.stateMachine = stateMachine
@@ -229,6 +282,7 @@ export class IngestionStepFunction extends Construct {
229282
feedReaderFunction,
230283
articleIngestionFunction,
231284
filterIngestedArticlesFunction,
285+
filterArticlesWithBedrockFunction,
232286
stateMachine
233287
],
234288
[

0 commit comments

Comments
 (0)