# crawlx

⚡ Lightweight web crawler with powerful plugins!
```js
const x = require("crawlx").default;

x({
  url: "http://quotes.toscrape.com/",
  // Parse rule: a scope selector, a set of field rules, and an optional post-processor.
  parse: [
    "[.quote]",                    // bracketed selector: collect every matching element
    {
      author: ".author",
      authorUrl: ".author+a@href", // @href extracts the attribute value
      text: ".text",
      tags: "[a.tag]",
      type: () => "quote"
    },
    s => ((s["crawled"] = new Date()), s) // add a crawled timestamp
  ],
  // Follow rule: extract author links and turn each one into a new task.
  follow: ["[.author+a@href]", followAuthorRule]
}).then(task => {
  console.log(task.parsed);
});

function followAuthorRule(url) {
  return {
    url,
    parse: {
      name: ["h3 | reverse", v => v.toUpperCase()], // "| reverse" applies a filter
      born: ".author-born-date | date"
    },
    callback(task) {
      console.log(task.parsed);
    }
  };
}
```
## Features
- Makes HTTP requests with got
- Priority queue of requests (see the sketch after this list)
- Simple plugin system
- Promise support
- Flexible schema with the powerful parse plugin, using only a single rule object
- Easily paginate and follow links with the built-in follow plugin
- Spawner mode: add a URL directly
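
The priority queue lets more important requests be fetched ahead of ones already waiting. Below is a minimal sketch of how that might look; the numeric `priority` field on a task is an assumption here (the quick-start example above does not use it), so confirm the exact option name in the documentation at crawlx.js.org.

```js
// Sketch of priority scheduling. Assumption: a task accepts a numeric
// `priority` field where a higher value is scheduled first; treat this
// field as hypothetical and verify it against the crawlx docs.
const x = require("crawlx").default;

// A lower-priority pagination page...
x({
  url: "http://quotes.toscrape.com/page/2/",
  priority: 1,
  parse: ["[.quote]", { text: ".text", author: ".author" }]
});

// ...and a higher-priority landing page that should be fetched first.
x({
  url: "http://quotes.toscrape.com/",
  priority: 10,
  parse: ["[.quote]", { text: ".text", author: ".author" }]
}).then(task => console.log(task.parsed));
```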
## Installation

```sh
npm install crawlx
```
## Documentation

Full documentation: crawlx.js.org

See more examples: crawlx/examples