guyromm / NoflimScraper

crawl and parse the idf.il casualty pages ; קצירה של דפי הנופלים של אתר צה"ל

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

what & why?

  • postgrest is a really nice backend for quickly prototyping apps by defining data & permissions, without having to write a backend.
  • webextension-toolbox is a cross-browser pleasant dev framework for browser extensions
  • tmux is used to run multiple processes and an editor, because nothing beats tmux and emacs (use web-mode to edit svelte!)

prerequisites for dev env

  1. postgrest - install according to instructions.
    1. pgjwt - postgresql jwt extension for postgrest auth
  2. nvm - to easily swap node/npm versions. tested with node v13.11.0.

cloning

npx degit git@git.webgma.co.il:ScraperStarterTemplate

package.json dependencies

nvm use ; ./npm-init.sh

.env

use envs/local.tpl to create an envs/local .env shell file, and then expand/eval it using ./setenv.sh

   # Print randomly chosen TCP ports from a range that no local socket is using.
   # Arguments: $1 - first port of the range (inclusive)
   #            $2 - last port of the range (inclusive)
   #            $3 - how many ports to print
   # Outputs:   one port per line on stdout (fewer than $3 if the range runs out).
   function freeport() {
	local FROM=$1
	local TO=$2
	local HOWMANY=$3
	# comm requires both inputs in the same (lexical) order, hence `sort` on each side.
	# The awk step strips everything up to the LAST colon of the local address, so
	# IPv6 sockets such as [::]:3000 yield 3000 rather than an empty field.
	comm -23 \
	     <(seq "$FROM" "$TO" | sort) \
	     <(ss -Htan | awk '{sub(/.*:/, "", $4); print $4}' | sort -u) \
	    | shuf | head -n "$HOWMANY"
   }
 # Derive the app/database name from the current directory name.
 export APPNAME=$(basename "$(pwd)")
 export DBNAME=$APPNAME
 # Only APPPORT is checked for availability; the other three ports are simply
 # the next consecutive numbers.
 export APPPORT=$(freeport 3000 4000 1)
 export POSTGRESTPORT=$((APPPORT+1))
 export SERVERPORT=$((APPPORT+2))
 export EXTENSIONPORT=$((APPPORT+3))
 # Feed /dev/urandom to tr directly: `head /dev/urandom` passes only ten "lines"
 # of random bytes, which may hold fewer matching characters than requested.
 export JWTSECRET=$(LC_ALL=C tr -dc A-F0-9 </dev/urandom | head -c 64)
 export POSTGREST_CLI_LOGIN="admin@${APPNAME}.com"
 export POSTGREST_CLI_PASS=$(LC_ALL=C tr -dc a-z0-9 </dev/urandom | head -c 16)
 # Instantiate the template and fill in every generated placeholder in one pass.
 cp envs/local.tpl envs/local
 sed -i -E \
     -e "s/APPPORTREPLACE/$APPPORT/g" \
     -e "s/SERVERPORTREPLACE/$SERVERPORT/g" \
     -e "s/EXTENSIONPORTREPLACE/$EXTENSIONPORT/g" \
     -e "s/POSTGRESTPORTREPLACE/$POSTGRESTPORT/g" \
     -e "s/DBNAMEREPLACE/$DBNAME/g" \
     -e "s/JWTSECRETREPLACE/$JWTSECRET/g" \
     -e "s/POSTGRESTCLILOGIN/$POSTGREST_CLI_LOGIN/g" \
     -e "s/POSTGRESTCLIPASS/$POSTGREST_CLI_PASS/g" \
     envs/local
 echo '* please provide a target domain'
 read -r TARGETDOMAIN
 # Escape \, / and & so that input such as a full URL cannot break the sed
 # replacement below.
 TARGETDOMAIN_ESCAPED=$(printf '%s' "$TARGETDOMAIN" | sed -e 's|[\\/&]|\\&|g')
 sed -i -E "s/TARGETDOMAINREPLACE/$TARGETDOMAIN_ESCAPED/g" envs/local
 ./setenv.sh local

deploying on aws

create a separate env

# Start the aws env as a copy of the local one, unless it already exists.
test -f envs/aws || cp envs/local envs/aws
sed -i -E "s/^RDS=''$/RDS=1/g" envs/aws
# Append each setting only when it is missing, so re-running this snippet does
# not pile up duplicate lines in envs/aws.
grep -E '^POSTGREST_PATH_AS_ARG=' envs/aws || echo POSTGREST_PATH_AS_ARG=1 >> envs/aws
# Single quotes keep $POSTGREST_PATH_AS_ARG literal in the env file.
grep -E '^VITE_POSTGREST_PATH_AS_ARG=' envs/aws || echo 'VITE_POSTGREST_PATH_AS_ARG=$POSTGREST_PATH_AS_ARG' >> envs/aws
RDS_PASSWORD=$(tr -dc A-Za-z0-9 </dev/urandom | head -c 16 ; echo '')
# Query the subnet groups once; the saved JSON is reused by jq below.
aws rds describe-db-subnet-groups | tee /tmp/sng.json
grep -E '^RDS_VPC_GROUP=(.+)$' envs/aws || echo 'RDS_VPC_GROUP='$(jq '.DBSubnetGroups[]|select(.DBSubnetGroupName | startswith("default-vpc-")).DBSubnetGroupName' -r /tmp/sng.json) >> envs/aws
grep -E '^RDS_PASSWORD=(.+)$' envs/aws || echo "RDS_PASSWORD=$RDS_PASSWORD" >> envs/aws
./setenv.sh aws
lambda/postgrest-download.sh

create the rds

# Create the RDS instance only when envs/aws has no RDS_HOSTNAME yet.
if ! grep -E '^RDS_HOSTNAME=(.+)$' envs/aws; then
  # Keep only the RDS_HOSTNAME=... line of the script's output and record it.
  ./lambda/rds-create.sh | grep -E '^RDS_HOSTNAME=' | tee -a envs/aws
  # Single quotes keep $RDS_PASSWORD and $RDS_HOSTNAME literal in envs/aws
  # (presumably expanded later by ./setenv.sh - confirm).
  echo 'DBURIADMIN="postgres://postgres:$RDS_PASSWORD@$RDS_HOSTNAME/template1"' | tee -a envs/aws
  echo 'DBURI="postgres://postgres:$RDS_PASSWORD@$RDS_HOSTNAME/"' | tee -a envs/aws
fi
./setenv.sh aws

create & deploy the lambda func

# APPNAME, ENV and LAMBDA_ROLE are expected to come from .env (assumption - confirm).
source .env
# Create the IAM role only when envs/aws has no LAMBDA_ROLE yet.
if ! grep -E '^LAMBDA_ROLE=(.+)$' envs/aws; then
  aws iam create-role --role-name "$APPNAME" --assume-role-policy-document file://lambda/trust-policy.tpl.json | tee "envs/$ENV.role.json"
  aws iam attach-role-policy --role-name "$APPNAME" --policy-arn arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
fi
# Record the role ARN. An explicit if/elif/else keeps NO_ROLE_FILE from being
# printed when the file exists but a later command in the chain fails.
if grep -E '^LAMBDA_ROLE=(.+)$' envs/aws; then
  echo 'LAMBDA_ROLE_ALREADY_SET'
elif test -f "envs/$ENV.role.json"; then
  echo "LAMBDA_ROLE=$(jq -r .Role.Arn "envs/$ENV.role.json")" | tee -a envs/aws
else
  echo 'NO_ROLE_FILE'
fi
./setenv.sh aws
source .env
lambda/pack.sh
aws lambda create-function --function-name "$APPNAME-postgrest" --runtime nodejs14.x --role "$LAMBDA_ROLE" --zip-file fileb://lambda/function.zip --handler index.handler --timeout 15 | tee "envs/$ENV.lambda.json"
grep -E '^AWS_POSTGREST_LAMBDA_FUNC=(.+)$' envs/aws || echo "AWS_POSTGREST_LAMBDA_FUNC=$(jq -r .FunctionName "envs/$ENV.lambda.json")" | tee -a envs/aws
aws lambda create-function-url-config --function-name "$APPNAME-postgrest" --auth-type NONE --cors 'AllowOrigins=*' | tee "envs/$ENV.lambda.url.json"
# Rewrite POSTGREST_BASE_URI with the function URL; the inner sed escapes the
# URL's slashes so they do not end the outer s/// expression.
test -f envs/$ENV.lambda.url.json && sed -i -E 's/^POSTGREST_BASE_URI=(.*)$/POSTGREST_BASE_URI="'$(jq .FunctionUrl envs/$ENV.lambda.url.json -r | sed -E 's/\//\\\//g')'"/g' envs/aws

database initialization

# Load .env into the current shell (presumably this is where DBNAME comes from - confirm).
source .env
# Print the database name before initializing it.
echo 'DBNAME:'$DBNAME
./db_init.sh

launch

./tmux.sh

backup & restore

generate a zstd-compressed snapshot
./pg_snapshot.sh
  • restore
    as postgres user initialize the db
    # Recreate the database, its grants, the required extensions and the JWT secret.
    # DBNAME and JWTSECRET must be set in the shell running these commands.
    psql template1 -c "drop database $DBNAME"
    psql template1 -c "create database $DBNAME" &&
      psql template1 -c "grant all on database $DBNAME to $DBNAME"
    psql "$DBNAME" -c 'create extension pgcrypto'
    psql "$DBNAME" -c 'create extension pgjwt'
    psql "$DBNAME" -c "grant anon to $DBNAME"
    psql "$DBNAME" -c "grant admin to $DBNAME"
    psql "$DBNAME" -c "alter database $DBNAME set app.jwt_secret='$JWTSECRET'"
    perform the snapshot restore
    # Skip dump lines mentioning pgcrypto/pgjwt, then stop at the first SQL error.
    zstdcat kwisatz-2022-08-29.sql.zst | grep -Ev 'pgcrypto|pgjwt' | ./psql.sh -v ON_ERROR_STOP=1

About

crawl and parse the idf.il casualty pages ; קצירה של דפי הנופלים של אתר צה"ל

License:MIT License


Languages

JavaScript 45.1% · Svelte 29.7% · PLpgSQL 13.0% · Shell 7.3% · CSS 1.5% · Python 1.5% · HTML 1.3% · Smarty 0.6%