pipenv --python 3.7.0
pipenv shell
pipenv sync --dev
# pipenv install requests BeautifulSoup4 lxml pyyaml pandas selenium schedule cerberus
# pipenv install oauth2client google-api-python-client google-cloud-storage gspread
# pipenv install "python-dotenv~=0.13.0"
# pipenv install "chromedriver-binary==83.0.4103.39.0"
# pipenv install scrapy ipython
# pipenv install --dev tox flake8 autopep8 pytest coverage black==18.9b0
pipenv run pip freeze > requirements.txt
cat << EOS >> .env
CMS_ID=xxxxxxxx
CMS_PS=xxxxxxxx
LOGIN=xxxxxxxx
ARTICLE=xxxxxxxx
WEB=xxxxxxxx
APP=xxxxxxxx
SLACK_WEBHOOK=https://hooks.slack.com/services/<team_id>/Bxxxxxxxx/xxxxxxxx
SLACK_PINGME=https://hooks.slack.com/services/<team_id>/Bxxxxxxxx/xxxxxxxx
UA=xxxxxxxx
CLIENT_SECRET_FILE=/path/to/client_secret.json
CREDENTIAL_FILE=/path/to/credentials.json
KEY_FILE_LOCATION=/path/to/project-name-xxxxxxx.json
FOLDER_ID=cms_folder_id
REPORT_ID=report_folder_id
FA_ID=fa_folder_id
APP_ID=app_id
ARTICLE_ID=article_spreadsheet_id
ARTICLE_RANGE=xxxxxxxx
TIMELINE_ID=timeline_spreadsheet_id
TIMELINE_RANGE=xxxxxxxx
ANALYTICS_ID=view_id
PROJECT_ID=project_id
FIREBASE_ID=firebase_id
APPLICATION_NAME=xxxxxxxx
PIN=xxxxxxxx
EMAIL=xxxxxxxx
PASSWORD=xxxxxxxx
EOS
# After editing .env, `exit` the shell and run `pipenv shell` again for the changes to take effect
# You need to share the Google Drive folder with the service account so it can create a spreadsheet in it
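# A minimal sketch of reading the .env values and posting to the Slack webhook
# (variable names follow the .env template above; the message text is just an example)
pipenv run python
>>> import os
>>> import requests
>>> from dotenv import load_dotenv
>>> load_dotenv()  # pipenv shell also loads .env automatically
>>> requests.post(os.getenv('SLACK_WEBHOOK'), json={'text': 'scraper finished'})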
# Cloud Functions run as PROJECT_ID@appspot.gserviceaccount.com by default
# The default timeout is 60 sec (you can raise it up to 540 sec)
# The default memory is 256MB (you can raise it up to 2048MB)
# The cron job sends a POST request to the Cloud Function:
# curl -X POST "https://YOUR_REGION-YOUR_PROJECT_ID.cloudfunctions.net/FUNCTION_NAME" \
# -H "Content-Type:application/json"
gcloud functions deploy main --runtime python37 --trigger-http --timeout=540 --memory=2048MB
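# A minimal sketch of the HTTP entry point the deploy command above expects
# (the handler body is an assumption; the real logic lives in the project's main.py)
# main.py
def main(request):
    """HTTP-triggered entry point; `request` is a flask.Request"""
    payload = request.get_json(silent=True) or {}
    # ... run the scraping/reporting job here ...
    return 'OK'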
brew install oath-toolkit
oathtool --totp --base32 [key (32 characters)]
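# A Python alternative using pyotp (not in the Pipfile above; just a sketch)
pipenv run python
>>> import pyotp
>>> pyotp.TOTP('[key (32 characters)]').now()  # same 6-digit code as oathtool --totp --base32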
brew install telnet
pipenv run pip show chromedriver-binary | grep "Version: 83"  # driver major version must match the installed Chrome
pipenv run pip list | grep -e Scrapy -e Twisted -e lxml -e pyOpenSSL
pipenv run pip show requests-html | grep "Requires"
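# A minimal sketch of wiring the pinned chromedriver-binary into Selenium
# (importing chromedriver_binary prepends the bundled driver to PATH; the URL is just an example)
pipenv run python
>>> import chromedriver_binary  # adds chromedriver to PATH
>>> from selenium import webdriver
>>> options = webdriver.ChromeOptions()
>>> options.add_argument('--headless')
>>> driver = webdriver.Chrome(options=options)
>>> driver.get('https://example.com')
>>> driver.title
>>> driver.quit()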
# scrapy project (Global commands)
scrapy -h
scrapy startproject [project_name]
cd [project_name]
scrapy genspider -l
scrapy genspider [-t template_name] [spider_name] [domain]
scrapy settings --get BOT_NAME
scrapy runspider [spider_module_name.py] [-o output_file_name(.json|.jl)]
scrapy shell '[domain]' [--nolog]
scrapy crawl [spider_name] -a [tag_name1=value1] -a [tag_name2=value2]
scrapy parse [url] --spider [spider_name] -c [spider_method] # call spider_method from the spider to parse a page
>>> shelp()
>>> help(scrapy.http.Request)
>>> response.css('title')
>>> response.css('title::text').getall()
>>> response.css('title::text').re(r'')
>>> response.css('li.next a').attrib['href']
>>> response.css('li.next a::attr(href)').get()
>>> response.xpath('//title').get()
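# A minimal spider sketch tying the commands above together
# (the spider name, domain, and selectors are hypothetical; -a arguments arrive as __init__ kwargs)
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://example.com']

    def __init__(self, tag_name1=None, *args, **kwargs):
        # passed in via `scrapy crawl example -a tag_name1=value1`
        super().__init__(*args, **kwargs)
        self.tag_name1 = tag_name1

    def parse(self, response):
        yield {'title': response.css('title::text').get()}
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)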
# scrapy project (Project-only commands)
scrapy edit [spider_name]
scrapy list
scrapy check -l
scrapy crawl [spider_name] [-o output_file_name(.json|.jl)]
scrapy crawl facms -o articles.csv
telnet localhost 6023
>>> from scrapy.utils.trackref import get_oldest
>>> from scrapy.utils.trackref import iter_all
>>> from scrapy.spiders import Spider
>>> prefs()
>>> prefs(ignore=Spider)
>>> r = get_oldest('HtmlResponse')
>>> r.url
>>> [r.url for r in iter_all('HtmlResponse')]
- client_secret.json (Google APIs OAuth 2.0 client ID - APPLICATION_NAME) -> credentials.json
- project-name-xxxxxxx.json (Google APIs service accounts)
- PROJECT_ID@appspot.gserviceaccount.com (App Engine default service account)
- SA_NAME@PROJECT_ID.iam.gserviceaccount.com (user-managed service account)
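# A minimal sketch of appending a row with the service-account key above
# (env variable names follow the .env template; the row content is just an example)
import os
import gspread
from oauth2client.service_account import ServiceAccountCredentials

scopes = ['https://spreadsheets.google.com/feeds',
          'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name(
    os.getenv('KEY_FILE_LOCATION'), scopes)
gc = gspread.authorize(creds)
sheet = gc.open_by_key(os.getenv('ARTICLE_ID')).sheet1
sheet.append_row(['2020-06-01', 'title', 'url'])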
- Account Explorer - for checking the view ID
- Core Reporting API - Common Queries
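# A minimal sketch of a Core Reporting API v4 query against the view ID above
# (the metric, dimension, and date range are just examples)
import os
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient.discovery import build

scopes = ['https://www.googleapis.com/auth/analytics.readonly']
creds = ServiceAccountCredentials.from_json_keyfile_name(
    os.getenv('KEY_FILE_LOCATION'), scopes)
analytics = build('analyticsreporting', 'v4', credentials=creds)
report = analytics.reports().batchGet(body={
    'reportRequests': [{
        'viewId': os.getenv('ANALYTICS_ID'),
        'dateRanges': [{'startDate': '7daysAgo', 'endDate': 'today'}],
        'metrics': [{'expression': 'ga:pageviews'}],
        'dimensions': [{'name': 'ga:pagePath'}],
    }],
}).execute()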
- gcloud functions deploy - create or update a Google Cloud Function
- Set memory and timeout for Cloud Functions in the Console
- Calling Cloud Functions - HTTP Triggers
- ChromeDriver - WebDriver for Chrome
- Configuration settings
- Populating the settings
- scrapy.http.Request.meta
- Request.meta special keys
- Using FormRequest.from_response() to simulate a user login
- scrapy/itemadapter
- Item Pipeline
- Item Exporters
- Feed exports
- Telnet Console
- Core API
- Custom Contracts
- Run Scrapy from a script
- Avoiding getting banned
- Selecting dynamically-loaded content
- Debugging memory leaks
- Downloader Middleware
- Spider Middleware