import { Options, Response } from "got";
import { CheerioOptions, Agent, AgentOptions } from "./agent";
import { Crawler } from "./crawler";
import createX from "./x";
import * as plugins from "./plugins";
import { Plugin } from "./plugin";
/**
* Use x to do everything!
* x is a function to enqueue a `Task`(or url string in spawner mode) with optional `TaskMeta`.
*```js
* const x = require('crawlx').default;
* const x_my = x.create({concurrency:1});
* x_my({url:"https://www.google.com", parse:"title"})
*```
*
*/
export interface X {
/**
* task
*/
(task: Task | string, meta?: TaskMeta): Promise<Task>;
/**
* Create a new instace by merging options.
*/
create: (options?: CrawlerOptions) => X;
/**
* Access to crawler instance.
*/
crawler: Crawler;
/**
* Got agent.
*
*/
agent: Agent;
/**
* Got agent's options
*/
agentOptions: AgentOptions;
/**
* Storage object for the crawler.
*/
store: object;
/**
* Register a plugin.
* @param {Plugin} plugin
*/
use: (plugin: Plugin) => Promise<any>;
/**
* Register a spawner
*/
spawner: (spawner: Spawner) => any;
[prop: string]: any;
}
/**
* Rules for generating a new Task from url and taskMeta.
* ```js
* x.spawner({
* regex: /forum\/page-\d+/,
* spawn: (url, meta)=>({
* url,
* parse: "title",
* callback(task){
* console.log(task.res.statusCode);
* }
* })
* })
*
* x("http://example.com/forum/page-2", {metakey:'metavalue'})
* ```
*/
export interface Spawner {
/**
* Regex expression for validating urls.
*/
regex?: RegExp;
/**
* Direct function for validating urls.
*/
validator?: (url: string, meta: TaskMeta) => boolean;
/**
* Spawn function should return a task object.
*/
spawn?: (url: string, meta: TaskMeta) => Task;
}
/**
* Options to initialize crawler instance or create new instance.
*/
export interface CrawlerOptions {
/**
* Set concurrency limit(default:5)
*/
concurrency?: number;
/**
* Passed to got. See [got](https://github.com/sindresorhus/got).
*/
got?: AgentOptions;
cheerio?: CheerioOptions;
manager?: { autoStart: boolean };
drain?: (crawler: Crawler) => any;
/**
* Register filter-functions for plugin-parse.
*/
filters?: object;
/**
* Attempt rule for plugin-retry(default: 0).
*/
attempts?: AttempRule;
[prop: string]: any;
}
/**
* TaskMeta is an object for containing essential information about the task.
* It is quite important for spawner mode and plugin's data storage.
*/
export interface TaskMeta {
/**
* Auto generated unique string ID.
*/
id?: string;
[prop: string]: any;
}
/**
* Task holds information to execute.
* This is non-spawned task structure as an object.
*/
export interface Task extends Options {
/**
* Url for http request.
*/
url: string;
/**
* Callback function. Called before resolving the task.
*/
callback?: (task: Task, crawler: Crawler) => any;
/**
* Meta object for task.
*/
meta?: TaskMeta;
/**
* Task with higher priority will be executed first.
*/
priority?: number;
spawned?: boolean;
/**
* Setting cancel to `true` will make crawler stop the execution of the task immediately.
*/
cancel?: true;
/**
* Response hooked after http request. See [got](https://github.com/sindresorhus/got)
*/
res?: Response;
/**
* Error hooked after an error throwed during http request.
*/
err?: Error;
/**
* parsing rule for plugin-parse.
*/
parse?: any;
/**
* check before parsing if it exists,
* return true if you want to continue the parsing execution
*/
parseCheck?: (res: Response) => boolean;
/**
* parsed result from plugin-parse.
*/
parsed?: any;
/**
* delay in ms for plugin-delay.
*/
delay?: number;
/**
* rule for plugin-follow: `[selector, taskFactoryFunc, filter=urls=>urls.filter(v=>v)]`.
* workflow:selector-parse => filterFunction => taskFactoryFunc for each
*/
follow?: FollowRule;
/**
* set of rules for plugin-follow.
*/
follows?: FollowRules;
/**
* `retries` or `[retries, allowedStatuses, callback({ err, shouldRetry, task, crawler })]`.
* Rule for plugin-attempt
*/
attempts?: AttempRule;
[prop: string]: any;
}
export type FollowRule =
| [string]
| [string, (u: any) => Task | string | Array<any>]
| [
string,
(u: any) => Task | string | Array<any>,
(parsed: Array<any>) => Array<any>
];
export type FollowRules = Array<FollowRule>;
export type AttempRule =
| number
| [number, Array<number>]
| [
number,
Array<number>,
(options: {
err: Error;
shouldRetry: boolean;
task: Task;
crawler: Crawler;
}) => any
];
const x = createX();
export default x;
export { plugins };
module.exports = x;
module.exports.default = x;
module.exports.plugins = plugins;
Source