diff --git a/.eslintrc.js b/.eslintrc.js new file mode 100644 index 0000000..47b00d4 --- /dev/null +++ b/.eslintrc.js @@ -0,0 +1,12 @@ +module.exports = { + "extends": [ + "react-app", + "prettier/@typescript-eslint", + "plugin:prettier/recommended" + ], + "settings": { + "react": { + "version": "999.999.999" + } + } +} \ No newline at end of file diff --git a/package.json b/package.json index 02cc843..2931983 100644 --- a/package.json +++ b/package.json @@ -34,6 +34,7 @@ "module": "dist/pulse-feed-parser.esm.js", "devDependencies": { "@size-limit/preset-small-lib": "^4.5.5", + "eslint-plugin-prettier": "^3.1.4", "husky": "^4.2.5", "prettier": "^2.0.5", "size-limit": "^4.5.5", diff --git a/src/Adapter/AtomFeedAdapter.ts b/src/Adapter/AtomFeedAdapter.ts index 9e8e548..922ba56 100644 --- a/src/Adapter/AtomFeedAdapter.ts +++ b/src/Adapter/AtomFeedAdapter.ts @@ -1,5 +1,12 @@ -import { Enclosure, Feed, Image, Item, Person } from '../types/Feed'; -import { AtomEntry, AtomFeed } from '../types/Atom'; +import { + Enclosure, + Feed, + Image, + Item, + Person, + AtomEntry, + AtomFeed, +} from '../types'; import { parsePerson } from '../utils/parsePerson'; // DefaultAtomTranslator converts an atom.Feed struct diff --git a/src/Adapter/RSSFeedAdapter.ts b/src/Adapter/RSSFeedAdapter.ts index e8a2b0a..514fd7a 100644 --- a/src/Adapter/RSSFeedAdapter.ts +++ b/src/Adapter/RSSFeedAdapter.ts @@ -1,5 +1,12 @@ -import { RSSFeed, RSSItem } from '../types/RSS'; -import { Enclosure, Feed, Image, Item, Person } from '../types/Feed'; +import { + Enclosure, + Feed, + Image, + Item, + Person, + RSSFeed, + RSSItem, +} from '../types'; import { parsePerson } from '../utils/parsePerson'; export class RSSFeedAdapter { diff --git a/src/Extensions/DublinCoreExtension.ts b/src/Extensions/DublinCoreExtension.ts index 6574a9a..0942c5a 100644 --- a/src/Extensions/DublinCoreExtension.ts +++ b/src/Extensions/DublinCoreExtension.ts @@ -1,18 +1,18 @@ -type DCExtension = { - title: Maybe - creator: Maybe - 
author: Maybe - subject: Maybe - description: Maybe - publisher: Maybe - contributor: Maybe - date: Maybe - type: Maybe - format: Maybe - identifier: Maybe - source: Maybe - language: Maybe - relation: Maybe - coverage: Maybe - rights: Maybe -} +export type DCExtension = { + title: Maybe; + creator: Maybe; + author: Maybe; + subject: Maybe; + description: Maybe; + publisher: Maybe; + contributor: Maybe; + date: Maybe; + type: Maybe; + format: Maybe; + identifier: Maybe; + source: Maybe; + language: Maybe; + relation: Maybe; + coverage: Maybe; + rights: Maybe; +}; diff --git a/src/Parser.ts b/src/Parser.ts index fc74f0a..9725761 100644 --- a/src/Parser.ts +++ b/src/Parser.ts @@ -1,4 +1,4 @@ -import { Feed } from './types/Feed'; +import { Feed } from './types'; import { FeedType, XmlFeedTypeDetector } from './XmlFeedTypeDetector'; import { RSSParser } from './Parsers/RSSParser'; import { AtomParser } from './Parsers/AtomParser'; @@ -6,28 +6,50 @@ import { RSSFeedAdapter } from './Adapter/RSSFeedAdapter'; import { AtomFeedAdapter } from './Adapter/AtomFeedAdapter'; import { NetworkError } from './Errors/NetworkError'; import { FeedTypeError } from './Errors/FeedTypeError'; +// @ts-ignore +import { version } from '../package.json'; -export const DEFAULT_FETCH_HEADERS = { - 'User-Agent': 'PulseRSS/1.0', +export type PFPOptions = { + /** + * Enables HTML content sanitization. + * Default sanitization rules will strip unwanted tags, attributes, comments + * and empty paragraphs. You can change this behavior with a function. + */ + sanitization?: boolean; + /** + * Options that will be passed to fetch() while parsing feeds from URLs. + * Default options contain a User-Agent string specific to PFP. 
+ */ + fetchOptions?: RequestInit; +}; + +const DEFAULT_OPTIONS = { + sanitization: true, + fetchOptions: { + headers: { + 'User-Agent': `pulse-feed-parser/${version}`, + }, + }, }; /** - * Parser Factory + * Pulse Feed Parser Factory */ export class Parser { - fetchOptions: RequestInit; + options: PFPOptions; - constructor( - { fetchOptions }: { fetchOptions: RequestInit } = { fetchOptions: {} } - ) { - this.fetchOptions = fetchOptions; + /** + * Changed options will be merged with the defaults. + */ + constructor(options?: PFPOptions) { + this.options = options ? mergeOptions(options) : DEFAULT_OPTIONS; } + /** + * Try to parse a feed from the given URL. + */ public async parseURL(url: string): Promise { - const response = await fetch(url, { - ...this.fetchOptions, - headers: { ...DEFAULT_FETCH_HEADERS, ...this.fetchOptions?.headers }, - }); + const response = await fetch(url, this.options.fetchOptions); if (response.status < 200 || response.status >= 300) { throw new NetworkError(`The feed is unreachable`, response.status); @@ -41,17 +63,35 @@ export class Parser { return this.parseDocument(doc); } + /** + * Parse a feed from the given XML document. + */ public parseDocument(doc: Document): Feed { + const { sanitization } = this.options; const type = XmlFeedTypeDetector.detect(doc); if (type === FeedType.RSS) { - return RSSFeedAdapter.adapt(new RSSParser(doc).parse()); + return RSSFeedAdapter.adapt(new RSSParser({ sanitization }).parse(doc)); } if (type === FeedType.Atom) { - return AtomFeedAdapter.adapt(new AtomParser(doc).parse()); + return AtomFeedAdapter.adapt(new AtomParser({ sanitization }).parse(doc)); } throw new FeedTypeError('Unknown feed type'); } } + +/** + * Merge provided options with the defaults. 
+ */ +const mergeOptions = (options: PFPOptions) => ({ + ...options, + fetchOptions: { + ...options.fetchOptions, + headers: { + ...DEFAULT_OPTIONS.fetchOptions.headers, + ...options.fetchOptions?.headers, + }, + }, +}); diff --git a/src/Parsers/AtomParser.ts b/src/Parsers/AtomParser.ts index 9cc189d..e651ef5 100644 --- a/src/Parsers/AtomParser.ts +++ b/src/Parsers/AtomParser.ts @@ -7,12 +7,14 @@ import { AtomLink, AtomPerson, AtomSource, -} from '../types/Atom'; + IParser, +} from '../types'; import { getExtensionName, isExtension, parseExtension, } from '../utils/extensions'; +import { BaseParser, ParserOptions } from './BaseParser'; // Atom elements which contain URIs // https://tools.ietf.org/html/rfc4287 @@ -36,17 +38,14 @@ import { /** * Parser for Atom feeds */ -export class AtomParser { +export class AtomParser extends BaseParser implements IParser { private readonly entry: AtomEntry; private readonly source: AtomSource; private readonly person: AtomPerson; private feed: AtomFeed; - private document: Document; - // private baseURL: Maybe; - constructor(document: Document) { - this.document = document; - // this.baseURL = null; + constructor(options?: ParserOptions) { + super(options); this.feed = { id: null, @@ -100,19 +99,14 @@ export class AtomParser { this.person = { email: null, name: null, uri: null }; } - public parse(): AtomFeed { - const root = this.document.firstElementChild; + public parse(doc: Document): AtomFeed { + const root = doc.firstElementChild; if (root === null) { throw new Error('No root node'); } - // this.baseURL = root.getAttributeNS('xml', 'base'); - - const walker = window.document.createTreeWalker( - root, - NodeFilter.SHOW_ELEMENT - ); + const walker = doc.createTreeWalker(root, NodeFilter.SHOW_ELEMENT); walker.firstChild(); this.parseRoot(walker); @@ -141,7 +135,8 @@ export class AtomParser { ...this.feed.extensions[ext][prop], extension, ]; - } if (tagName === 'title') { + } + if (tagName === 'title') { this.feed.title = 
this.parseText(walker.currentNode as Element); } else if (tagName === 'id') { this.feed.id = this.parseText(walker.currentNode as Element); @@ -217,7 +212,10 @@ export class AtomParser { entry.extensions[ext][prop] = []; } - entry.extensions[ext][prop] = [...entry.extensions[ext][prop], extension]; + entry.extensions[ext][prop] = [ + ...entry.extensions[ext][prop], + extension, + ]; } else if (tagName === 'title') { entry.title = this.parseText(walker.currentNode as Element); } else if (tagName === 'id') { @@ -395,7 +393,7 @@ export class AtomParser { // If type="xhtml", then this element contains inline xhtml, wrapped in a div element. if (type === 'xhtml') { - return node.firstElementChild!.textContent!.trim() + return node.firstElementChild!.textContent!.trim(); } return null; diff --git a/src/Parsers/BaseParser.ts b/src/Parsers/BaseParser.ts new file mode 100644 index 0000000..96aff80 --- /dev/null +++ b/src/Parsers/BaseParser.ts @@ -0,0 +1,15 @@ +export type ParserOptions = { + sanitization?: boolean; +}; + +const DEFAULT_OPTIONS = { + sanitization: true, +}; + +export abstract class BaseParser { + protected options: ParserOptions; + + protected constructor(options?: ParserOptions) { + this.options = { ...DEFAULT_OPTIONS, ...options }; + } +} diff --git a/src/Parsers/JSONParser.ts b/src/Parsers/JSONParser.ts new file mode 100644 index 0000000..3d00301 --- /dev/null +++ b/src/Parsers/JSONParser.ts @@ -0,0 +1,12 @@ +// @ts-nocheck +import { BaseParser, ParserOptions } from './BaseParser'; +import { IParser } from '../types'; + +/** + * @todo + */ +export class JSONParser extends BaseParser implements IParser { + constructor(options: ParserOptions) { + super(options); + } +} diff --git a/src/Parsers/JsonParser.ts b/src/Parsers/JsonParser.ts deleted file mode 100644 index c362b3a..0000000 --- a/src/Parsers/JsonParser.ts +++ /dev/null @@ -1,10 +0,0 @@ -/** - * @todo - */ -export class JsonParser { - content: JSON; - - constructor(content: JSON) { - this.content = 
content; - } -} diff --git a/src/Parsers/RSSParser.ts b/src/Parsers/RSSParser.ts index 636c605..aa34b24 100644 --- a/src/Parsers/RSSParser.ts +++ b/src/Parsers/RSSParser.ts @@ -8,27 +8,30 @@ import { RSSSource, RSSCloud, RSSTextInput, -} from '../types/RSS'; + IParser, +} from '../types'; + import { append } from '../utils/collection'; import { getExtensionName, isExtension, parseExtension, } from '../utils/extensions'; +import { sanitize } from '../utils/sanitizer'; +import { BaseParser, ParserOptions } from './BaseParser'; /** * Parser for RSS feeds */ -export class RSSParser { +export class RSSParser extends BaseParser implements IParser { feed: RSSFeed; private readonly image: RSSImage; private readonly item: RSSItem; private readonly guid: RSSGUID; private readonly textInput: RSSTextInput; - private document: Document; - constructor(document: Document) { - this.document = document; + constructor(options?: ParserOptions) { + super(options); this.image = { description: null, @@ -88,17 +91,14 @@ export class RSSParser { }; } - public parse(): RSSFeed { - const root = this.document.firstElementChild; + public parse(doc: Document): RSSFeed { + const root = doc.firstElementChild; if (root === null) { throw new Error('No root node'); } - const walker = window.document.createTreeWalker( - root, - NodeFilter.SHOW_ELEMENT - ); + const walker = doc.createTreeWalker(root, NodeFilter.SHOW_ELEMENT); walker.firstChild(); do { @@ -244,7 +244,7 @@ export class RSSParser { } else if (tagName === 'pubdate') { item.pubDate = this.parseText(walker.currentNode as Element); } else if (tagName === 'content:encoded') { - item.content = this.parseText(walker.currentNode as Element); + item.content = this.parseHTML(walker.currentNode as Element); } else if (tagName === 'source') { item.source = this.parseSource(walker.currentNode as Element); } else if (tagName === 'enclosure') { @@ -336,4 +336,13 @@ export class RSSParser { return null; } + + private parseHTML(node: Node): Maybe { + 
const text = this.parseText(node); + + if (text === null) return null; + + const doc = new DOMParser().parseFromString(text, 'text/html'); + return sanitize(doc); + } } diff --git a/src/index.ts b/src/index.ts index 965900f..b5ca2da 100644 --- a/src/index.ts +++ b/src/index.ts @@ -4,6 +4,6 @@ export * from './Parsers/RSSParser'; export * from './Parsers/AtomParser'; export * from './Adapter/AtomFeedAdapter'; export * from './Adapter/RSSFeedAdapter'; -export * from './Errors/FeedTypeError' -export * from './Errors/NetworkError' +export * from './Errors/FeedTypeError'; +export * from './Errors/NetworkError'; export * from './types'; diff --git a/src/types/Atom.ts b/src/types/Atom.ts index f06f4b8..f20c02a 100644 --- a/src/types/Atom.ts +++ b/src/types/Atom.ts @@ -16,7 +16,7 @@ export type AtomFeed = { authors: Maybe>; categories: Maybe>; entries: Maybe>; - extensions: Maybe + extensions: Maybe; }; // Entry is an Atom Entry @@ -33,7 +33,7 @@ export type AtomEntry = { published: Maybe; source: Maybe; content: Maybe; - extensions: Maybe + extensions: Maybe; }; // Category is category metadata for Feeds and Entries diff --git a/src/types/Extension.ts b/src/types/Extension.ts index 52eb3ca..40b1153 100644 --- a/src/types/Extension.ts +++ b/src/types/Extension.ts @@ -1,10 +1,10 @@ export type Extension = { name: string; value: Maybe; - attrs: Maybe - children: Maybe> -} + attrs: Maybe; + children: Maybe>; +}; export type Attrs = { - [key: string]: string -} + [key: string]: string; +}; diff --git a/src/types/IParser.ts b/src/types/IParser.ts new file mode 100644 index 0000000..15c4557 --- /dev/null +++ b/src/types/IParser.ts @@ -0,0 +1,3 @@ +export interface IParser { + parse(doc: Document): any; +} diff --git a/src/types/RSS.ts b/src/types/RSS.ts index 01fc4b3..3c5beef 100644 --- a/src/types/RSS.ts +++ b/src/types/RSS.ts @@ -22,7 +22,7 @@ export type RSSFeed = { cloud: Maybe; textInput: Maybe; items: Maybe>; - extensions: Maybe + extensions: Maybe; }; // Item is an RSS 
Item @@ -38,7 +38,7 @@ export type RSSItem = { guid: Maybe; pubDate: Maybe; source: Maybe; - extensions: Maybe + extensions: Maybe; }; // Image is an image that represents the feed diff --git a/src/types/index.ts b/src/types/index.ts index 0b43b4b..2fa0549 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -2,3 +2,4 @@ export * from './Atom'; export * from './Extension'; export * from './Feed'; export * from './RSS'; +export * from './IParser'; diff --git a/src/utils/extensions.ts b/src/utils/extensions.ts index e586c3e..0bd1342 100644 --- a/src/utils/extensions.ts +++ b/src/utils/extensions.ts @@ -1,4 +1,4 @@ -import { Extension } from '../types/Extension'; +import { Extension } from '../types'; const EXTENSION: Extension = { name: '', @@ -26,13 +26,16 @@ export const getExtensionName = (node: Element): string => node.prefix!; */ export const parseExtension = (node: Element): [string, Extension] => { const ext = { ...EXTENSION }; - const firstChildName = node.firstChild?.nodeName; + const firstChild = node.firstChild; + const isTextNode = + node.firstChild?.nodeType === Node.COMMENT_NODE || + node.firstChild?.nodeType === Node.CDATA_SECTION_NODE; + ext.name = node.nodeName.toLowerCase(); - const isTextNode = ['#text', '#cdata-section'].includes(firstChildName!); if (isTextNode) { ext.value = node.textContent!.trim(); - } else if (firstChildName !== undefined) { + } else if (firstChild !== null) { // child will be undefined in case of self-closing node // like ext.children = Array.from(node.childNodes).map(node => { diff --git a/src/utils/parsePerson.ts b/src/utils/parsePerson.ts index eaf568d..05cc2b1 100644 --- a/src/utils/parsePerson.ts +++ b/src/utils/parsePerson.ts @@ -1,9 +1,9 @@ -import { Person } from '../types/Feed'; +import { Person } from '../types'; -const emailNameRgx = new RegExp(`^([^@]+@[^\s]+)\s+\(([^@]+)\)$`); -const nameEmailRgx = new RegExp(`^([^@]+)\s+\(([^@]+@[^)]+)\)$`); -const nameOnlyRgx = new RegExp(`^([^@()]+)$`); -const 
emailOnlyRgx = new RegExp(`^([^@()]+@[^@()]+)$`); +const emailNameRgx = new RegExp(/^([^@]+@[^\s]+)\s+\(([^@]+)\)$/); +const nameEmailRgx = new RegExp(/^([^@]+)\s+\(([^@]+@[^)]+)\)$/); +const nameOnlyRgx = new RegExp(/^([^@()]+)$/); +const emailOnlyRgx = new RegExp(/^([^@()]+@[^@()]+)$/); // ParseNameAddress parses name/email strings commonly // found in RSS feeds of the format "Example Name (example@site.com)" diff --git a/src/utils/sanitizer.ts b/src/utils/sanitizer.ts new file mode 100644 index 0000000..190ff0f --- /dev/null +++ b/src/utils/sanitizer.ts @@ -0,0 +1,376 @@ +const ACCEPTABLE_ELEMENTS = new Set([ + 'a', + 'abbr', + 'acronym', + 'address', + 'area', + 'article', + 'aside', + 'audio', + 'b', + 'big', + 'blockquote', + 'br', + 'button', + 'canvas', + 'caption', + 'center', + 'cite', + 'code', + 'col', + 'colgroup', + 'command', + 'datagrid', + 'datalist', + 'dd', + 'del', + 'details', + 'dfn', + 'dialog', + 'dir', + 'div', + 'dl', + 'dt', + 'em', + 'event-source', + 'fieldset', + 'figcaption', + 'figure', + 'font', + 'footer', + 'form', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'header', + 'hr', + 'i', + 'img', + 'input', + 'ins', + 'kbd', + 'keygen', + 'label', + 'legend', + 'li', + 'm', + 'map', + 'menu', + 'meter', + 'multicol', + 'nav', + 'nextid', + 'noscript', + 'ol', + 'optgroup', + 'option', + 'output', + 'p', + 'pre', + 'progress', + 'q', + 's', + 'samp', + 'section', + 'select', + 'small', + 'sound', + 'source', + 'spacer', + 'span', + 'strike', + 'strong', + 'sub', + 'sup', + 'svg', + 'table', + 'tbody', + 'td', + 'textarea', + 'tfoot', + 'th', + 'thead', + 'time', + 'tr', + 'tt', + 'u', + 'ul', + 'var', + 'video', + 'iframe', +]); + +const ACCEPTABLE_ATTRIBUTES = new Set([ + 'abbr', + 'align', + 'alt', + 'autocomplete', + 'autofocus', + 'cellpadding', + 'cellspacing', + 'cite', + 'colspan', + 'compact', + 'disabled', + 'height', + 'href', + 'hreflang', + 'label', + 'rows', + 'rowspan', + 'span', + 'src', + 'target', + 'title', + 
'width', +]); + +const ACCEPTABLE_EMPTY_ELEMENTS = new Set([ + 'img', + 'video', + 'audio', + 'hr', + 'br', + 'canvas', + 'input', + 'area', + 'iframe', +]); + +const MATHML_ELEMENTS = new Set([ + 'annotation', + 'annotation-xml', + 'maction', + 'maligngroup', + 'malignmark', + 'math', + 'menclose', + 'merror', + 'mfenced', + 'mfrac', + 'mglyph', + 'mi', + 'mlabeledtr', + 'mlongdiv', + 'mmultiscripts', + 'mn', + 'mo', + 'mover', + 'mpadded', + 'mphantom', + 'mprescripts', + 'mroot', + 'mrow', + 'ms', + 'mscarries', + 'mscarry', + 'msgroup', + 'msline', + 'mspace', + 'msqrt', + 'msrow', + 'mstack', + 'mstyle', + 'msub', + 'msubsup', + 'msup', + 'mtable', + 'mtd', + 'mtext', + 'mtr', + 'munder', + 'munderover', + 'none', + 'semantics', +]); + +const MATHML_ATTRIBUTES = new Set([ + 'accent', + 'accentunder', + 'actiontype', + 'align', + 'alignmentscope', + 'altimg', + 'altimg-height', + 'altimg-valign', + 'altimg-width', + 'alttext', + 'bevelled', + 'charalign', + 'close', + 'columnalign', + 'columnlines', + 'columnspacing', + 'columnspan', + 'columnwidth', + 'crossout', + 'decimalpoint', + 'denomalign', + 'depth', + 'dir', + 'display', + 'displaystyle', + 'edge', + 'encoding', + 'equalcolumns', + 'equalrows', + 'fence', + 'fontstyle', + 'fontweight', + 'form', + 'frame', + 'framespacing', + 'groupalign', + 'height', + 'href', + 'id', + 'indentalign', + 'indentalignfirst', + 'indentalignlast', + 'indentshift', + 'indentshiftfirst', + 'indentshiftlast', + 'indenttarget', + 'infixlinebreakstyle', + 'largeop', + 'length', + 'linebreak', + 'linebreakmultchar', + 'linebreakstyle', + 'lineleading', + 'linethickness', + 'location', + 'longdivstyle', + 'lquote', + 'lspace', + 'mathbackground', + 'mathcolor', + 'mathsize', + 'mathvariant', + 'maxsize', + 'minlabelspacing', + 'minsize', + 'movablelimits', + 'notation', + 'numalign', + 'open', + 'other', + 'overflow', + 'position', + 'rowalign', + 'rowlines', + 'rowspacing', + 'rowspan', + 'rquote', + 'rspace', + 'scriptlevel', 
+ 'scriptminsize', + 'scriptsizemultiplier', + 'selection', + 'separator', + 'separators', + 'shift', + 'side', + 'src', + 'stackalign', + 'stretchy', + 'subscriptshift', + 'superscriptshift', + 'symmetric', + 'voffset', + 'width', + 'xlink:href', + 'xlink:show', + 'xlink:type', + 'xmlns', + 'xmlns:xlink', +]); + +const ALL_ACCEPTABLE_ELEMENTS = new Set([ + ...ACCEPTABLE_ELEMENTS, + ...MATHML_ELEMENTS, +]); + +/** + * Clear the given DOM tree from unwanted elements and attributes. + */ +export const sanitize = (doc: Document): string => { + const walker = doc.createTreeWalker( + doc.body, + NodeFilter.SHOW_ELEMENT + NodeFilter.SHOW_COMMENT + ); + + while (walker.nextNode()) { + const current = walker.currentNode as Element; + const nodeName = current.nodeName.toLowerCase(); + + // Strip HTML comments + // Strip unacceptable elements + if ( + current.nodeType === Node.COMMENT_NODE || + !ALL_ACCEPTABLE_ELEMENTS.has(nodeName) + ) { + removeNodeFromDocument(walker, current); + continue; + } + + // Remove redundant empty elements + if ( + current.childNodes.length === 0 && + !ACCEPTABLE_EMPTY_ELEMENTS.has(nodeName) + ) { + removeNodeFromDocument(walker, current); + continue; + } + + // Skip SVG checking + if (nodeName === 'svg') { + skipNodeChecking(walker, current); + continue; + } + + // Clear common elements' attributes + if (ACCEPTABLE_ELEMENTS.has(nodeName)) { + current.getAttributeNames().forEach(attribute => { + if (!ACCEPTABLE_ATTRIBUTES.has(attribute)) { + current.removeAttribute(attribute); + } + }); + // Clean MATHML elements' attributes + } else if (MATHML_ELEMENTS.has(nodeName)) { + current.getAttributeNames().forEach(attribute => { + if (!MATHML_ATTRIBUTES.has(attribute)) { + current.removeAttribute(attribute); + } + }); + } + } + + return doc.body.innerHTML; +}; + +/** + * Helper that removes the node from the document and sets a currentNode + * of a walker object back to parent to continue walking. 
+ */ +const removeNodeFromDocument = (walker: TreeWalker, node: Element) => { + const parent = node.parentNode!; + parent.removeChild(node); + + // Set currentNode to parent to prevent breaking of the walk + walker.currentNode = parent; +}; + +/** + * Sets currentNode to next available + */ +const skipNodeChecking = (walker: TreeWalker, node: Element) => { + const nextSibling = node.nextSibling; + + if (nextSibling !== null) { + walker.currentNode = nextSibling; + } +}; diff --git a/test/AtomParser.test.ts b/test/AtomParser.test.ts index 115d239..8a8385f 100644 --- a/test/AtomParser.test.ts +++ b/test/AtomParser.test.ts @@ -1,6 +1,6 @@ import fs from 'fs'; import path from 'path'; -import { AtomParser } from '../src/Parsers/AtomParser'; +import { AtomParser } from '../src'; const feedPaths = [ path.join(__dirname, './stubs/atom/gitlab.xml'), diff --git a/test/RssParser.test.ts b/test/RssParser.test.ts index 61f0765..7d06367 100644 --- a/test/RssParser.test.ts +++ b/test/RssParser.test.ts @@ -1,6 +1,6 @@ import fs from 'fs'; import path from 'path'; -import { RSSParser } from '../src/Parsers/RSSParser'; +import { RSSParser } from '../src'; const feedPaths = [ path.join(__dirname, './stubs/rss/github.xml'), @@ -21,9 +21,9 @@ it('should parse RSS feeds', () => { feedPaths.forEach(p => { const xml = fs.readFileSync(p, { encoding: 'utf8' }); const doc = new DOMParser().parseFromString(xml, 'application/xml'); - const parser = new RSSParser(doc); + const parser = new RSSParser(); - expect(() => parser.parse()).not.toThrowError(); + expect(() => parser.parse(doc)).not.toThrowError(); }); }); @@ -31,8 +31,8 @@ it('should parse canonical feed', () => { const canonical = fs.readFileSync(canonicalFeedPath, { encoding: 'utf8' }); const expected = fs.readFileSync(canonicalExpectation, { encoding: 'utf8' }); const doc = new DOMParser().parseFromString(canonical, 'application/xml'); - const parser = new RSSParser(doc); - const data = parser.parse(); + const parser = new 
RSSParser(); + const data = parser.parse(doc); expect(data).toEqual(JSON.parse(expected)); }); diff --git a/test/helpers.ts b/test/helpers.ts new file mode 100644 index 0000000..58fcb44 --- /dev/null +++ b/test/helpers.ts @@ -0,0 +1,23 @@ +import fs from 'fs'; +import path from 'path'; + +export const getStub = (basePath: string) => (stub: string) => { + return fs.readFileSync( + path.resolve(__dirname, `./stubs/${basePath}/${stub}`), + { encoding: 'utf8' } + ); +}; + +export const saveSnapshot = (basePath: string) => ( + stub: string, + content: string +) => { + const segments = stub.split('.'); + segments.splice(-1, 0, 'result'); + const fileName = segments.join('.'); + + fs.writeFileSync( + path.resolve(__dirname, `./stubs/${basePath}/${fileName}`), + content + ); +}; diff --git a/test/sanitizer.test.ts b/test/sanitizer.test.ts new file mode 100644 index 0000000..b27d07f --- /dev/null +++ b/test/sanitizer.test.ts @@ -0,0 +1,77 @@ +import { getStub } from './helpers'; +import { sanitize } from '../src/utils/sanitizer'; + +const getStubContent = getStub('sanitizer'); + +describe('Content sanitization', () => { + it('should remove unacceptable empty elements', () => { + const doc = new DOMParser().parseFromString( + getStubContent('empty-elements.html'), + 'text/html' + ); + + expect(sanitize(doc)).toEqual(getStubContent('empty-elements.result.html')); + }); + + it('should preserve allowed empty elements', () => { + const doc = new DOMParser().parseFromString( + getStubContent('allowed-empty-elements.html'), + 'text/html' + ); + + expect(sanitize(doc)).toEqual( + getStubContent('allowed-empty-elements.result.html') + ); + }); + + it('should remove restricted attributes', () => { + const doc = new DOMParser().parseFromString( + getStubContent('restricted-attributes.html'), + 'text/html' + ); + + expect(sanitize(doc)).toEqual( + getStubContent('restricted-attributes.result.html') + ); + }); + + it('should preserve allowed attributes', () => { + const doc = new 
DOMParser().parseFromString( + getStubContent('allowed-attributes.html'), + 'text/html' + ); + + expect(sanitize(doc)).toEqual( + getStubContent('allowed-attributes.result.html') + ); + }); + + it('should remove restricted elements', () => { + const doc = new DOMParser().parseFromString( + getStubContent('restricted-elements.html'), + 'text/html' + ); + + expect(sanitize(doc)).toEqual( + getStubContent('restricted-elements.result.html') + ); + }); + + it('should remove html comments', () => { + const doc = new DOMParser().parseFromString( + getStubContent('html-comments.html'), + 'text/html' + ); + + expect(sanitize(doc)).toEqual(getStubContent('html-comments.result.html')); + }); + + it('should preserve svg elements', () => { + const doc = new DOMParser().parseFromString( + getStubContent('svg.html'), + 'text/html' + ); + + expect(sanitize(doc)).toEqual(getStubContent('svg.result.html')); + }); +}); diff --git a/test/stubs/sanitizer/allowed-attributes.html b/test/stubs/sanitizer/allowed-attributes.html new file mode 100644 index 0000000..24dea86 --- /dev/null +++ b/test/stubs/sanitizer/allowed-attributes.html @@ -0,0 +1,3 @@ +Alt text +test + diff --git a/test/stubs/sanitizer/allowed-attributes.result.html b/test/stubs/sanitizer/allowed-attributes.result.html new file mode 100644 index 0000000..24dea86 --- /dev/null +++ b/test/stubs/sanitizer/allowed-attributes.result.html @@ -0,0 +1,3 @@ +Alt text +test + diff --git a/test/stubs/sanitizer/allowed-empty-elements.html b/test/stubs/sanitizer/allowed-empty-elements.html new file mode 100644 index 0000000..ad90a16 --- /dev/null +++ b/test/stubs/sanitizer/allowed-empty-elements.html @@ -0,0 +1,9 @@ + + + + +
+
+ + + diff --git a/test/stubs/sanitizer/allowed-empty-elements.result.html b/test/stubs/sanitizer/allowed-empty-elements.result.html new file mode 100644 index 0000000..ad90a16 --- /dev/null +++ b/test/stubs/sanitizer/allowed-empty-elements.result.html @@ -0,0 +1,9 @@ + + + + +
+
+ + + diff --git a/test/stubs/sanitizer/empty-elements.html b/test/stubs/sanitizer/empty-elements.html new file mode 100644 index 0000000..88d7f0d --- /dev/null +++ b/test/stubs/sanitizer/empty-elements.html @@ -0,0 +1,4 @@ +

+
+

Hello

+ diff --git a/test/stubs/sanitizer/empty-elements.result.html b/test/stubs/sanitizer/empty-elements.result.html new file mode 100644 index 0000000..a89756d --- /dev/null +++ b/test/stubs/sanitizer/empty-elements.result.html @@ -0,0 +1,4 @@ + + +

Hello

+ diff --git a/test/stubs/sanitizer/html-comments.html b/test/stubs/sanitizer/html-comments.html new file mode 100644 index 0000000..679c0d8 --- /dev/null +++ b/test/stubs/sanitizer/html-comments.html @@ -0,0 +1,6 @@ +

+ + Hello, World! + +

+ diff --git a/test/stubs/sanitizer/html-comments.result.html b/test/stubs/sanitizer/html-comments.result.html new file mode 100644 index 0000000..a286a60 --- /dev/null +++ b/test/stubs/sanitizer/html-comments.result.html @@ -0,0 +1,6 @@ +

+ + Hello, World! + +

+ diff --git a/test/stubs/sanitizer/restricted-attributes.html b/test/stubs/sanitizer/restricted-attributes.html new file mode 100644 index 0000000..9a0d429 --- /dev/null +++ b/test/stubs/sanitizer/restricted-attributes.html @@ -0,0 +1,3 @@ +

class attribute

+
style attribute
+

js attributes

diff --git a/test/stubs/sanitizer/restricted-attributes.result.html b/test/stubs/sanitizer/restricted-attributes.result.html new file mode 100644 index 0000000..4964abd --- /dev/null +++ b/test/stubs/sanitizer/restricted-attributes.result.html @@ -0,0 +1,3 @@ +

class attribute

+
style attribute
+

js attributes

diff --git a/test/stubs/sanitizer/restricted-elements.html b/test/stubs/sanitizer/restricted-elements.html new file mode 100644 index 0000000..7504e12 --- /dev/null +++ b/test/stubs/sanitizer/restricted-elements.html @@ -0,0 +1,5 @@ + + + + +text diff --git a/test/stubs/sanitizer/restricted-elements.result.html b/test/stubs/sanitizer/restricted-elements.result.html new file mode 100644 index 0000000..139597f --- /dev/null +++ b/test/stubs/sanitizer/restricted-elements.result.html @@ -0,0 +1,2 @@ + + diff --git a/test/stubs/sanitizer/svg.html b/test/stubs/sanitizer/svg.html new file mode 100644 index 0000000..2433f8e --- /dev/null +++ b/test/stubs/sanitizer/svg.html @@ -0,0 +1,7 @@ + + + + I love SVG! + + Sorry, your browser does not support inline SVG. + diff --git a/test/stubs/sanitizer/svg.result.html b/test/stubs/sanitizer/svg.result.html new file mode 100644 index 0000000..2433f8e --- /dev/null +++ b/test/stubs/sanitizer/svg.result.html @@ -0,0 +1,7 @@ + + + + I love SVG! + + Sorry, your browser does not support inline SVG. 
+ diff --git a/tsconfig.json b/tsconfig.json index 60f11f2..0665fca 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,8 +1,14 @@ { - "include": ["src", "types"], + "include": [ + "src", + "types" + ], "compilerOptions": { "module": "esnext", - "lib": ["dom", "esnext"], + "lib": [ + "dom", + "esnext" + ], "importHelpers": true, "declaration": true, "sourceMap": true, @@ -15,9 +21,14 @@ "moduleResolution": "node", "baseUrl": "src", "paths": { - "*": ["src/*", "node_modules/*"] + "*": [ + "src/*", + "node_modules/*" + ] }, "jsx": "react", - "esModuleInterop": true + "esModuleInterop": true, + "resolveJsonModule": true, + "downlevelIteration": true } } diff --git a/yarn.lock b/yarn.lock index c984f4c..d2818c0 100644 --- a/yarn.lock +++ b/yarn.lock @@ -3256,6 +3256,13 @@ eslint-plugin-prettier@^3.1.0: dependencies: prettier-linter-helpers "^1.0.0" +eslint-plugin-prettier@^3.1.4: + version "3.1.4" + resolved "https://registry.yarnpkg.com/eslint-plugin-prettier/-/eslint-plugin-prettier-3.1.4.tgz#168ab43154e2ea57db992a2cd097c828171f75c2" + integrity sha512-jZDa8z76klRqo+TdGDTFJSavwbnWK2ZpqGKNZ+VvweMW516pDUMmQ2koXvxEE4JhzNvTv+radye/bWGBmA6jmg== + dependencies: + prettier-linter-helpers "^1.0.0" + eslint-plugin-react-hooks@^2.2.0: version "2.5.1" resolved "https://registry.yarnpkg.com/eslint-plugin-react-hooks/-/eslint-plugin-react-hooks-2.5.1.tgz#4ef5930592588ce171abeb26f400c7fbcbc23cd0"