add crawlee

kev 2023-03-08 15:03:19 +08:00
parent 724f6c45e3
commit 30e5f41486
7 changed files with 124 additions and 0 deletions


@@ -303,6 +303,12 @@ A collection of delicious docker recipes.
- [x] adguard/adguardhome
- [x] ghcr.io/linuxserver/airsonic :musical_note:
- [x] apify/actor-node
- [x] apify/actor-node-puppeteer-chrome
- [x] apify/actor-node-playwright
- [x] apify/actor-node-playwright-chrome
- [x] apify/actor-node-playwright-firefox
- [x] apify/actor-node-playwright-webkit
- [x] archivebox/archivebox
- [x] docker.bintray.io/jfrog/artifactory-oss
- [x] jeffail/benthos

crawlee/.dockerignore Normal file

@@ -0,0 +1,4 @@
Dockerfile
README.md
docker-compose.yml
data/

crawlee/Dockerfile Normal file

@@ -0,0 +1,29 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./
# Run the image.
CMD npm start --silent

crawlee/README.md Normal file

@@ -0,0 +1,29 @@
crawlee
=======
[Crawlee][1] is a web scraping and browser automation library.
```bash
$ docker-compose build
Building crawlee
Successfully built xxxxxxxxxxxx
Successfully tagged crawlee:latest
$ docker-compose run --rm crawlee
INFO BasicCrawler: Starting the crawl
INFO BasicCrawler: Processing ...
Crawler finished.
$ tree data
├── datasets
│   └── default
│       ├── 000000001.json
│       ├── 000000002.json
│       ├── 000000003.json
│       └── 000000004.json
├── key_value_stores
└── request_queues
```
[1]: https://crawlee.dev/

crawlee/docker-compose.yml Normal file

@@ -0,0 +1,7 @@
version: "3.8"
services:
  crawlee:
    image: crawlee
    build: .
    volumes:
      - ./data:/usr/src/app/storage
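
The volume line maps ./data on the host to /usr/src/app/storage inside the container, which is where Crawlee keeps its default storage, so the crawl results survive the run. As a minimal sketch (assuming the crawler has already run and that each dataset file holds the { url, html } records pushed by main.js; the script name is hypothetical), the stored items can be inspected from the host with plain Node.js:

```js
// read-results.mjs - hypothetical helper, not part of this recipe.
// Reads back the items stored under data/datasets/default, mirroring
// the ./data:/usr/src/app/storage volume mapping above.
import { readdir, readFile } from 'node:fs/promises';

const dir = 'data/datasets/default';
for (const name of await readdir(dir)) {
    const item = JSON.parse(await readFile(`${dir}/${name}`, 'utf8'));
    console.log(item.url, `${item.html.length} characters of HTML`);
}
```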

crawlee/main.js Normal file

@@ -0,0 +1,35 @@
import { BasicCrawler, Dataset } from 'crawlee';

// Create a BasicCrawler - the simplest crawler that enables
// users to implement the crawling logic themselves.
const crawler = new BasicCrawler({
    // This function will be called for each URL to crawl.
    async requestHandler({ request, sendRequest, log }) {
        const { url } = request;
        log.info(`Processing ${url}...`);

        // Fetch the page HTML via the crawlee sendRequest utility method.
        // By default, the method will use the current request that is being
        // handled, so you don't have to provide it yourself. You can also
        // provide a custom request if you want.
        const { body } = await sendRequest();

        // Store the HTML and URL to the default dataset.
        await Dataset.pushData({
            url,
            html: body,
        });
    },
});

// The initial list of URLs to crawl. Here we use just a few hard-coded URLs.
await crawler.addRequests([
    'https://www.google.com',
    'https://www.example.com',
    'https://www.bing.com',
    'https://www.wikipedia.com',
]);

// Run the crawler and wait for it to finish.
await crawler.run();

console.log('Crawler finished.');
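
main.js uses BasicCrawler, so it fetches pages with sendRequest and stores the raw HTML. For comparison, here is a minimal sketch of the same idea with Crawlee's CheerioCrawler, which downloads and parses each page for you; the 'title' selector and the single URL are illustrative only and not part of this recipe:

```js
import { CheerioCrawler, Dataset } from 'crawlee';

// CheerioCrawler exposes the parsed page as a cheerio handle ($),
// so the handler can extract data without calling sendRequest itself.
const crawler = new CheerioCrawler({
    async requestHandler({ request, $, log }) {
        const title = $('title').text();
        log.info(`${request.url} -> ${title}`);
        await Dataset.pushData({ url: request.url, title });
    },
});

await crawler.addRequests(['https://www.example.com']);
await crawler.run();
```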

crawlee/package.json Normal file

@@ -0,0 +1,14 @@
{
    "description": "Crawlee Demo Project",
    "version": "0.0.1",
    "license": "UNLICENSED",
    "type": "module",
    "main": "main.js",
    "scripts": {
        "start": "node main.js"
    },
    "dependencies": {
        "crawlee": "*"
    },
    "repository": {}
}