From 817c0252c8758161ce7e5b223efeb6c3dcdf165a Mon Sep 17 00:00:00 2001 From: kev Date: Sat, 20 Jun 2015 16:45:20 +0800 Subject: [PATCH] update --- scrapyd/Dockerfile | 1 - scrapyd/README.md | 48 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/scrapyd/Dockerfile b/scrapyd/Dockerfile index 61c29d6..a3395d3 100644 --- a/scrapyd/Dockerfile +++ b/scrapyd/Dockerfile @@ -37,7 +37,6 @@ RUN apt-get update \ && curl -sSL https://bootstrap.pypa.io/get-pip.py | python \ && pip install git+https://github.com/scrapy/scrapy.git \ git+https://github.com/scrapy/scrapyd.git \ - service_identity \ && curl -sSL https://github.com/scrapy/scrapy/raw/master/extras/scrapy_bash_completion -o /etc/bash_completion.d/scrapy_bash_completion \ && echo 'source /etc/bash_completion.d/scrapy_bash_completion' >> /root/.bashrc \ && apt-get remove -y autoconf \ diff --git a/scrapyd/README.md b/scrapyd/README.md index 812aa43..2616b18 100644 --- a/scrapyd/README.md +++ b/scrapyd/README.md @@ -1,8 +1,52 @@ scrapyd ======= -Dockerfile for building an image that runs [scrapyd][1]. +Dockerfile for building an image that runs [scrapy][1] and [scrapyd][2]. + +Only the latest versions of two Python packages are installed: + +- `scrapy`: git+https://github.com/scrapy/scrapy.git +- `scrapyd`: git+https://github.com/scrapy/scrapyd.git Please use this image as base for your own project. 
-[1]: https://github.com/scrapy/scrapyd +## run as background-daemon for scrapyd + +``` +$ docker run -d --restart always --name scrapyd -p 6800:6800 vimagick/scrapyd +$ firefox http://localhost:6800 +``` + +## run as interactive-shell for scrapy + +``` +$ cat > stackoverflow_spider.py << _EOF_ +import scrapy + +class StackOverflowSpider(scrapy.Spider): + name = 'stackoverflow' + start_urls = ['http://stackoverflow.com/questions?sort=votes'] + + def parse(self, response): + for href in response.css('.question-summary h3 a::attr(href)'): + full_url = response.urljoin(href.extract()) + yield scrapy.Request(full_url, callback=self.parse_question) + + def parse_question(self, response): + yield { + 'title': response.css('h1 a::text').extract()[0], + 'votes': response.css('.question .vote-count-post::text').extract()[0], + 'body': response.css('.question .post-text').extract()[0], + 'tags': response.css('.question .post-tag::text').extract(), + 'link': response.url, + } +_EOF_ + +$ docker run -it --rm --name scrapy -v `pwd`:/code -w /code vimagick/scrapyd bash +>>> scrapy runspider stackoverflow_spider.py -o top-stackoverflow-questions.json +>>> cat top-stackoverflow-questions.json +>>> exit +``` + +[1]: https://github.com/scrapy/scrapy +[2]: https://github.com/scrapy/scrapyd