From 30e5f414863c994f186641650655eb0f8d0244f3 Mon Sep 17 00:00:00 2001
From: kev
Date: Wed, 8 Mar 2023 15:03:19 +0800
Subject: [PATCH] add crawlee

---
 README.md                  |  6 ++++++
 crawlee/.dockerignore      |  4 ++++
 crawlee/Dockerfile         | 29 +++++++++++++++++++++++++++++
 crawlee/README.md          | 29 +++++++++++++++++++++++++++++
 crawlee/docker-compose.yml |  7 +++++++
 crawlee/main.js            | 35 +++++++++++++++++++++++++++++++++++
 crawlee/package.json       | 14 ++++++++++++++
 7 files changed, 124 insertions(+)
 create mode 100644 crawlee/.dockerignore
 create mode 100644 crawlee/Dockerfile
 create mode 100644 crawlee/README.md
 create mode 100644 crawlee/docker-compose.yml
 create mode 100644 crawlee/main.js
 create mode 100644 crawlee/package.json

diff --git a/README.md b/README.md
index d847378..cc0f2c7 100644
--- a/README.md
+++ b/README.md
@@ -303,6 +303,12 @@ A collection of delicious docker recipes.

 - [x] adguard/adguardhome
 - [x] ghcr.io/linuxserver/airsonic :musical_note:
+- [x] apify/actor-node
+  - [x] apify/actor-node-puppeteer-chrome
+  - [x] apify/actor-node-playwright
+  - [x] apify/actor-node-playwright-chrome
+  - [x] apify/actor-node-playwright-firefox
+  - [x] apify/actor-node-playwright-webkit
 - [x] archivebox/archivebox
 - [x] docker.bintray.io/jfrog/artifactory-oss
 - [x] jeffail/benthos
diff --git a/crawlee/.dockerignore b/crawlee/.dockerignore
new file mode 100644
index 0000000..f5b0890
--- /dev/null
+++ b/crawlee/.dockerignore
@@ -0,0 +1,4 @@
+Dockerfile
+README.md
+docker-compose.yml
+data/
diff --git a/crawlee/Dockerfile b/crawlee/Dockerfile
new file mode 100644
index 0000000..4018d6b
--- /dev/null
+++ b/crawlee/Dockerfile
@@ -0,0 +1,29 @@
+# Specify the base Docker image. You can read more about
+# the available images at https://crawlee.dev/docs/guides/docker-images
+# You can also use any other image from Docker Hub.
+FROM apify/actor-node:16
+
+# Copy just package.json and package-lock.json
+# to speed up the build using the Docker layer cache.
+COPY package*.json ./
+
+# Install NPM packages, skipping optional and development dependencies to
+# keep the image small. Avoid logging too much, and print the dependency
+# tree for debugging.
+RUN npm --quiet set progress=false \
+    && npm install --omit=dev --omit=optional \
+    && echo "Installed NPM packages:" \
+    && (npm list --omit=dev --all || true) \
+    && echo "Node.js version:" \
+    && node --version \
+    && echo "NPM version:" \
+    && npm --version
+
+# Next, copy the remaining files and directories with the source code.
+# Since we do this after the NPM install, rebuilds will be really fast
+# for most source file changes.
+COPY . ./
+
+# Run the crawler when the container starts.
+CMD npm start --silent
diff --git a/crawlee/README.md b/crawlee/README.md
new file mode 100644
index 0000000..616b5c2
--- /dev/null
+++ b/crawlee/README.md
@@ -0,0 +1,29 @@
+crawlee
+=======
+
+[Crawlee][1] is a web scraping and browser automation library for Node.js.
+
+```bash
+$ docker-compose build
+Building crawlee
+Successfully built xxxxxxxxxxxx
+Successfully tagged crawlee:latest
+
+$ docker-compose run --rm crawlee
+INFO  BasicCrawler: Starting the crawl
+INFO  BasicCrawler: Processing ...
+Crawler finished.
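+
+# Each request handled above was stored as one JSON file in the default
+# dataset; docker-compose.yml mounts the container's storage at ./data: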
+
+$ tree data
+├── datasets
+│   └── default
+│       ├── 000000001.json
+│       ├── 000000002.json
+│       ├── 000000003.json
+│       └── 000000004.json
+├── key_value_stores
+└── request_queues
+```
+
+[1]: https://crawlee.dev/
diff --git a/crawlee/docker-compose.yml b/crawlee/docker-compose.yml
new file mode 100644
index 0000000..81f2dd7
--- /dev/null
+++ b/crawlee/docker-compose.yml
@@ -0,0 +1,7 @@
+version: "3.8"
+services:
+  crawlee:
+    image: crawlee
+    build: .
+    volumes:
+      - ./data:/usr/src/app/storage
diff --git a/crawlee/main.js b/crawlee/main.js
new file mode 100644
index 0000000..f1e4584
--- /dev/null
+++ b/crawlee/main.js
@@ -0,0 +1,35 @@
+import { BasicCrawler, Dataset } from 'crawlee';
+
+// Create a BasicCrawler - the simplest crawler, which lets
+// users implement the crawling logic themselves.
+const crawler = new BasicCrawler({
+    // This function will be called for each URL to crawl.
+    async requestHandler({ request, sendRequest, log }) {
+        const { url } = request;
+        log.info(`Processing ${url}...`);
+
+        // Fetch the page HTML via the Crawlee sendRequest utility method.
+        // By default, the method uses the request that is currently being
+        // handled, so you don't have to provide it yourself. You can also
+        // pass a custom request if you want.
+        const { body } = await sendRequest();
+
+        // Store the URL and HTML in the default dataset.
+        await Dataset.pushData({
+            url,
+            html: body,
+        });
+    },
+});
+
+// The initial list of URLs to crawl. Here we use just a few hard-coded URLs.
+await crawler.addRequests([
+    'https://www.google.com',
+    'https://www.example.com',
+    'https://www.bing.com',
+    'https://www.wikipedia.com',
+]);
+
+// Run the crawler and wait for it to finish.
+await crawler.run();
+
+console.log('Crawler finished.');
diff --git a/crawlee/package.json b/crawlee/package.json
new file mode 100644
index 0000000..adb49ff
--- /dev/null
+++ b/crawlee/package.json
@@ -0,0 +1,14 @@
+{
+    "description": "Crawlee Demo Project",
+    "version": "0.0.1",
+    "license": "UNLICENSED",
+    "type": "module",
+    "main": "main.js",
+    "scripts": {
+        "start": "node main.js"
+    },
+    "dependencies": {
+        "crawlee": "*"
+    },
+    "repository": {}
+}
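
For a quick look at what the crawl produced, here is a minimal sketch that reads the dataset items back from the mounted `./data` directory. The `read-results.js` file name is an assumption and is not part of the patch; it presumes the crawler has already run via `docker-compose run --rm crawlee`, and that the script lives in the same project so it runs as an ES module (package.json sets `"type": "module"`).

```js
// read-results.js - hypothetical helper, not included in the patch above.
// Reads back the items that main.js stored with Dataset.pushData(), which
// the docker-compose.yml volume maps to ./data on the host.
import { readdir, readFile } from 'node:fs/promises';
import { join } from 'node:path';

const dir = 'data/datasets/default';

for (const name of await readdir(dir)) {
    // Each file is one dataset item: a JSON object of the shape { url, html }.
    const item = JSON.parse(await readFile(join(dir, name), 'utf8'));
    console.log(`${name}: ${item.url} (${item.html.length} characters of HTML)`);
}
```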