@@ -49,31 +49,59 @@ import (
4949 log "github.com/sirupsen/logrus"
5050)
5151
52+ // HookFn is a callback function for processing HTML data at a given place in the DOM tree.
53+ // The first argument, e, gives access to the DOM, and the second argument, data, carries a pointer
54+ // to the data struct you want to save important information in. You can cast data to the type
55+ // returned by Scraper.InitialData(). The return values are an optional string, which tells the
56+ // scraper to also scrape another page, and an error.
5257type HookFn func (e * colly.HTMLElement , data interface {}) (* string , error )
5358
59+ // Hook maps a handler of type HookFn to a DOMPath in the tree. The DOMPath can be any valid CSS selector.
5460type Hook struct {
61+ // DOMPath specifies one or more elements in the DOM tree using a CSS selector.
5562 DOMPath string
63+
64+ // Handler specifies the handler to be invoked for all of the elements on the HTML page matched by the CSS selector.
5665 Handler HookFn
5766}
5867
68+ // Scraper is an interface which scraping implementations should implement.
69+ // Any struct that satisfies this interface may be passed to the generic Scrape function in this package.
5970type Scraper interface {
71+ // Name returns a user-friendly name of the scraper.
6072 Name () string
6173
74+ // Hooks returns the hooks for all HTML elements that should be matched and their handlers.
6275 Hooks () []Hook
6376
77+ // InitialData returns the struct pointer which is then shared between/passed to all hook handlers.
6478 InitialData () interface {}
6579}
6680
81+ // Extension is an interface which allows for adding extensions on-demand to scraping implementations.
82+ // When calling Scrape(), you may pass extra extension implementations in ScrapeOptions. The extension
83+ // can register its own extra hook for processing the DOM. The extension shares/manipulates the same
84+ // data as the Scraper it's used together with.
6785type Extension interface {
86+ // Name returns the name of the extension.
6887 Name () string
88+
89+ // Hook returns the hook registered by this extension.
6990 Hook () Hook
7091}
7192
93+ // ScrapeOptions contains extra parameters used when scraping.
7294type ScrapeOptions struct {
95+ // Extensions allows registering extensions for a Scrape() call.
7396 Extensions []Extension
74- LogLevel * log.Level
97+ // LogLevel specifies the logrus log level for the Scrape() function.
98+ LogLevel * log.Level
7599}
76100
101+ // Scrape takes in a Scraper struct, a URL to scrape, and optionally extra options.
102+ // This function calls handlers from the Scraper.Hooks() for the given DOM paths, and
103+ // shares the Scraper.InitialData() struct pointer between them. The return value is that
104+ // struct pointer, and/or possibly an error.
77105func Scrape (url string , s Scraper , opts * ScrapeOptions ) (interface {}, error ) {
78106 c := colly .NewCollector ()
79107 mux := & sync.Mutex {}
0 commit comments