From 30e5f414863c994f186641650655eb0f8d0244f3 Mon Sep 17 00:00:00 2001
From: kev
Date: Wed, 8 Mar 2023 15:03:19 +0800
Subject: [PATCH] add crawlee

---
 README.md                  |  6 ++++++
 crawlee/.dockerignore      |  4 ++++
 crawlee/Dockerfile         | 29 +++++++++++++++++++++++++++++
 crawlee/README.md          | 29 +++++++++++++++++++++++++++++
 crawlee/docker-compose.yml |  7 +++++++
 crawlee/main.js            | 35 +++++++++++++++++++++++++++++++++++
 crawlee/package.json       | 14 ++++++++++++++
 7 files changed, 124 insertions(+)
 create mode 100644 crawlee/.dockerignore
 create mode 100644 crawlee/Dockerfile
 create mode 100644 crawlee/README.md
 create mode 100644 crawlee/docker-compose.yml
 create mode 100644 crawlee/main.js
 create mode 100644 crawlee/package.json

diff --git a/README.md b/README.md
index d847378..cc0f2c7 100644
--- a/README.md
+++ b/README.md
@@ -303,6 +303,12 @@ A collection of delicious docker recipes.

 - [x] adguard/adguardhome
 - [x] ghcr.io/linuxserver/airsonic :musical_note:
+- [x] apify/actor-node
+  - [x] apify/actor-node-puppeteer-chrome
+  - [x] apify/actor-node-playwright
+  - [x] apify/actor-node-playwright-chrome
+  - [x] apify/actor-node-playwright-firefox
+  - [x] apify/actor-node-playwright-webkit
 - [x] archivebox/archivebox
 - [x] docker.bintray.io/jfrog/artifactory-oss
 - [x] jeffail/benthos
diff --git a/crawlee/.dockerignore b/crawlee/.dockerignore
new file mode 100644
index 0000000..f5b0890
--- /dev/null
+++ b/crawlee/.dockerignore
@@ -0,0 +1,4 @@
+Dockerfile
+README.md
+docker-compose.yml
+data/
diff --git a/crawlee/Dockerfile b/crawlee/Dockerfile
new file mode 100644
index 0000000..4018d6b
--- /dev/null
+++ b/crawlee/Dockerfile
@@ -0,0 +1,29 @@
+# Specify the base Docker image. You can read more about
+# the available images at https://crawlee.dev/docs/guides/docker-images
+# You can also use any other image from Docker Hub.
+FROM apify/actor-node:16
+
+# Copy just package.json and package-lock.json
+# to speed up the build using the Docker layer cache.
+COPY package*.json ./
+
+# Install NPM packages, skipping optional and development dependencies to
+# keep the image small. Avoid logging too much, and print the dependency
+# tree for debugging.
+RUN npm --quiet set progress=false \
+    && npm install --omit=dev --omit=optional \
+    && echo "Installed NPM packages:" \
+    && (npm list --omit=dev --all || true) \
+    && echo "Node.js version:" \
+    && node --version \
+    && echo "NPM version:" \
+    && npm --version
+
+# Next, copy the remaining files and directories with the source code.
+# Since we do this after the NPM install, rebuilds will be really fast
+# for most source file changes.
+COPY . ./
+
+# Run the crawler when the container starts.
+CMD npm start --silent
diff --git a/crawlee/README.md b/crawlee/README.md
new file mode 100644
index 0000000..616b5c2
--- /dev/null
+++ b/crawlee/README.md
@@ -0,0 +1,29 @@
+crawlee
+=======
+
+[Crawlee][1] is a web scraping and browser automation library for Node.js.
+
+```bash
+$ docker-compose build
+Building crawlee
+Successfully built xxxxxxxxxxxx
+Successfully tagged crawlee:latest
+
+$ docker-compose run --rm crawlee
+INFO  BasicCrawler: Starting the crawl
+INFO  BasicCrawler: Processing ...
+Crawler finished.
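+
+# Each request handled above was stored as one JSON file in the default
+# dataset; docker-compose.yml mounts the container's storage at ./data: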
+
+$ tree data
+├── datasets
+│   └── default
+│       ├── 000000001.json
+│       ├── 000000002.json
+│       ├── 000000003.json
+│       └── 000000004.json
+├── key_value_stores
+└── request_queues
+```
+
+[1]: https://crawlee.dev/
diff --git a/crawlee/docker-compose.yml b/crawlee/docker-compose.yml
new file mode 100644
index 0000000..81f2dd7
--- /dev/null
+++ b/crawlee/docker-compose.yml
@@ -0,0 +1,7 @@
+version: "3.8"
+services:
+  crawlee:
+    image: crawlee
+    build: .
+    volumes:
+      - ./data:/usr/src/app/storage
diff --git a/crawlee/main.js b/crawlee/main.js
new file mode 100644
index 0000000..f1e4584
--- /dev/null
+++ b/crawlee/main.js
@@ -0,0 +1,35 @@
+import { BasicCrawler, Dataset } from 'crawlee';
+
+// Create a BasicCrawler - the simplest crawler, which lets
+// users implement the crawling logic themselves.
+const crawler = new BasicCrawler({
+    // This function will be called for each URL to crawl.
+    async requestHandler({ request, sendRequest, log }) {
+        const { url } = request;
+        log.info(`Processing ${url}...`);
+
+        // Fetch the page HTML via the Crawlee sendRequest utility method.
+        // By default, the method uses the request that is currently being
+        // handled, so you don't have to provide it yourself. You can also
+        // pass a custom request if you want.
+        const { body } = await sendRequest();
+
+        // Store the URL and HTML in the default dataset.
+        await Dataset.pushData({
+            url,
+            html: body,
+        });
+    },
+});
+
+// The initial list of URLs to crawl. Here we use just a few hard-coded URLs.
+await crawler.addRequests([
+    'https://www.google.com',
+    'https://www.example.com',
+    'https://www.bing.com',
+    'https://www.wikipedia.com',
+]);
+
+// Run the crawler and wait for it to finish.
+await crawler.run();
+
+console.log('Crawler finished.');
diff --git a/crawlee/package.json b/crawlee/package.json
new file mode 100644
index 0000000..adb49ff
--- /dev/null
+++ b/crawlee/package.json
@@ -0,0 +1,14 @@
+{
+    "description": "Crawlee Demo Project",
+    "version": "0.0.1",
+    "license": "UNLICENSED",
+    "type": "module",
+    "main": "main.js",
+    "scripts": {
+        "start": "node main.js"
+    },
+    "dependencies": {
+        "crawlee": "*"
+    },
+    "repository": {}
+}
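
For a quick look at what the crawl produced, here is a minimal sketch that reads the dataset items back from the mounted `./data` directory. The `read-results.js` file name is an assumption and is not part of the patch; it presumes the crawler has already run via `docker-compose run --rm crawlee`, and that the script lives in the same project so it runs as an ES module (package.json sets `"type": "module"`).

```js
// read-results.js - hypothetical helper, not included in the patch above.
// Reads back the items that main.js stored with Dataset.pushData(), which
// the docker-compose.yml volume maps to ./data on the host.
import { readdir, readFile } from 'node:fs/promises';
import { join } from 'node:path';

const dir = 'data/datasets/default';

for (const name of await readdir(dir)) {
    // Each file is one dataset item: a JSON object of the shape { url, html }.
    const item = JSON.parse(await readFile(join(dir, name), 'utf8'));
    console.log(`${name}: ${item.url} (${item.html.length} characters of HTML)`);
}
```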