mirror of https://github.com/vimagick/dockerfiles
add crawlee
This commit is contained in:
parent
724f6c45e3
commit
30e5f41486
|
@ -303,6 +303,12 @@ A collection of delicious docker recipes.
|
|||
|
||||
- [x] adguard/adguardhome
|
||||
- [x] ghcr.io/linuxserver/airsonic :musical_note:
|
||||
- [x] apify/actor-node
|
||||
- [x] apify/actor-node-puppeteer-chrome
|
||||
- [x] apify/actor-node-playwright
|
||||
- [x] apify/actor-node-playwright-chrome
|
||||
- [x] apify/actor-node-playwright-firefox
|
||||
- [x] apify/actor-node-playwright-webkit
|
||||
- [x] archivebox/archivebox
|
||||
- [x] docker.bintray.io/jfrog/artifactory-oss
|
||||
- [x] jeffail/benthos
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
# Build-context exclusions (presumably .dockerignore — filename not visible in
# this view; confirm). These files are not needed inside the image.
Dockerfile
README.md
docker-compose.yml
# Local crawl output directory; docker-compose bind-mounts it at runtime,
# so baking it into the image would be redundant (and could be large).
data/
|
|
@ -0,0 +1,29 @@
|
|||
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
# NOTE(review): shell-form CMD means npm (not the shell's child) may not
# receive signals directly; exec form ["npm", "start", "--silent"] would
# forward SIGTERM more predictably — confirm whether that matters here.
CMD npm start --silent
|
|
@ -0,0 +1,29 @@
|
|||
crawlee
|
||||
=======
|
||||
|
||||
[Crawlee][1] is a web scraping and browser automation library.
|
||||
|
||||
```bash
|
||||
$ docker-compose build
|
||||
Building crawlee
|
||||
Successfully built xxxxxxxxxxxx
|
||||
Successfully tagged crawlee:latest
|
||||
|
||||
$ docker-compose run --rm crawlee
|
||||
INFO BasicCrawler: Starting the crawl
|
||||
INFO BasicCrawler: Processing ...
|
||||
Crawler finished.
|
||||
|
||||
$ tree data
|
||||
├── datasets
|
||||
│ └── default
|
||||
│ ├── 000000001.json
|
||||
│ ├── 000000002.json
|
||||
│ ├── 000000003.json
|
||||
│ └── 000000004.json
|
||||
├── key_value_stores
|
||||
└── request_queues
|
||||
```
|
||||
|
||||
[1]: https://crawlee.dev/
|
|
@ -0,0 +1,7 @@
|
|||
# Compose setup for the Crawlee demo crawler.
version: "3.8"
services:
  crawlee:
    # Build the image from the Dockerfile in this directory, tagged "crawlee".
    image: crawlee
    build: .
    # Persist crawl results on the host: the crawler's storage directory
    # (datasets, key-value stores, request queues) lives under
    # /usr/src/app/storage in the container and is mapped to ./data.
    volumes:
      - ./data:/usr/src/app/storage
|
|
@ -0,0 +1,35 @@
|
|||
import { BasicCrawler, Dataset } from 'crawlee';

// Hard-coded seed URLs for this demo crawl.
const startUrls = [
  'https://www.google.com',
  'https://www.example.com',
  'https://www.bing.com',
  'https://www.wikipedia.com',
];

// BasicCrawler is the simplest crawler variant: it leaves the actual
// crawling logic entirely to the user-supplied requestHandler below.
const crawler = new BasicCrawler({
  // Invoked once for every URL taken from the request queue.
  async requestHandler({ request, sendRequest, log }) {
    log.info(`Processing ${request.url}...`);

    // sendRequest() fetches the page for the request currently being
    // handled — no argument needed, though a custom request object
    // could be supplied instead.
    const response = await sendRequest();

    // Persist the URL together with the raw page HTML into the
    // default dataset.
    await Dataset.pushData({
      url: request.url,
      html: response.body,
    });
  },
});

// Seed the queue with the initial URLs, run the crawl to completion,
// then report.
await crawler.addRequests(startUrls);
await crawler.run();

console.log('Crawler finished.');
|
|
@ -0,0 +1,14 @@
|
|||
{
|
||||
"description": "Crawlee Demo Project",
|
||||
"version": "0.0.1",
|
||||
"license": "UNLICENSED",
|
||||
"type": "module",
|
||||
"main": "main.js",
|
||||
"scripts": {
|
||||
"start": "node main.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"crawlee": "*"
|
||||
},
|
||||
"repository": {}
|
||||
}
|
Loading…
Reference in New Issue