zalando / spilo

Highly available elephant herd: HA PostgreSQL cluster using Docker

BUG: External ETCD for Remote Standby.

PGPAWAN opened this issue

When trying to use an external etcd with the Zalando operator, the pod (remote replica) is brought up with the standby leader role, whereas the same pod shows the master role when we use the Kubernetes-native DCS.

Is this expected? Isn't the standby pod supposed to be a remote replica using S3?
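
For reference, and not part of the original report: one way to compare the two setups is to ask Patroni itself which role it thinks it holds, via its REST API on port 8008 (the restapi.listen address in the configs below). The pod name, namespace, and config path here are taken from those configs and are assumptions about this environment.

kubectl -n pg-pgteststage2 exec pg-pgteststage2-0 -- curl -s http://localhost:8008/patroni
# The returned JSON includes a "role" field, e.g. "master" or "standby_leader".

# Or, from a shell inside the pod (assuming Spilo keeps its generated Patroni config here):
patronictl -c /home/postgres/postgres.yml list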

Patroni YAML using the Kubernetes-native DCS:

bootstrap:
  dcs:
    failsafe_mode: false
    loop_wait: 10
    maximum_lag_on_failover: 33554432
    postgresql:
      parameters:
        archive_mode: 'on'
        archive_timeout: 1800s
        autovacuum_analyze_scale_factor: 0.02
        autovacuum_max_workers: 5
        autovacuum_vacuum_scale_factor: 0.05
        checkpoint_completion_target: 0.9
        effective_cache_size: 0MB
        hot_standby: 'on'
        log_autovacuum_min_duration: 0
        log_checkpoints: 'on'
        log_connections: 'on'
        log_disconnections: 'on'
        log_line_prefix: '%t [%p]: [%l-1] %c %x %d %u %a %h '
        log_lock_waits: 'on'
        log_min_duration_statement: 500
        log_rotation_size: '100000'
        log_statement: ddl
        log_temp_files: 0
        maintenance_work_mem: 0MB
        max_connections: '300'
        max_replication_slots: 10
        max_wal_senders: 10
        tcp_keepalives_idle: 900
        tcp_keepalives_interval: 100
        track_functions: all
        wal_compression: 'on'
        wal_level: hot_standby
        wal_log_hints: 'on'
        work_mem: 1MB
      use_pg_rewind: true
      use_slots: true
    retry_timeout: 10
    standby_cluster:
      create_replica_methods:
      - bootstrap_standby_with_wale
      - basebackup_fast_xlog
      restore_command: envdir "/run/etc/wal-e.d/env-standby" timeout "0" /scripts/restore_command.sh "%f" "%p"
    ttl: 30
  initdb:
  - auth-host: md5
  - auth-local: trust
  - data-checksums
  - encoding: UTF8
  - locale: en_US.UTF-8
  post_init: /scripts/post_init.sh "zalandos"
  users:
    zalandos:
      options:
      - CREATEDB
      - NOLOGIN
      password: ''
kubernetes:
  bypass_api_service: true
  labels:
    application: spilo
  port: tcp://10.23.0.1:443
  port_443_tcp: tcp://10.23.0.1:443
  port_443_tcp_addr: 10.23.0.1
  port_443_tcp_port: '443'
  port_443_tcp_proto: tcp
  ports:
  - name: postgresql
    port: 5432
  role_label: spilo-role
  scope_label: cluster-name
  service_host: 10.23.0.1
  service_port: '443'
  service_port_https: '443'
  use_endpoints: true
namespace: pg-pgteststage2
postgresql:
  authentication:
    replication:
      password: Password
      username: standby
    superuser:
      password: Password
      username: postgres
  basebackup_fast_xlog:
    command: /scripts/basebackup.sh
    retries: 2
  bin_dir: /usr/lib/postgresql/14/bin
  bootstrap_standby_with_wale:
    command: envdir "/run/etc/wal-e.d/env-standby" bash /scripts/wale_restore.sh
    no_master: 1
    retries: 2
    threshold_backup_size_percentage: 30
    threshold_megabytes: 102400
  callbacks:
    on_role_change: /scripts/on_role_change.sh zalandos true
  connect_address: 10.22.63.71:5432
  create_replica_method:
  - basebackup_fast_xlog
  data_dir: /home/postgres/pgdata/pgroot/data
  listen: '*:5432'
  name: pg-pgteststage2-0
  parameters:
    archive_command: /bin/true
    bg_mon.history_buckets: 120
    bg_mon.listen_address: 0.0.0.0
    extwlist.custom_path: /scripts
    extwlist.extensions: btree_gin,btree_gist,citext,extra_window_functions,first_last_agg,hll,hstore,hypopg,intarray,ltree,pgcrypto,pgq,pgq_node,pg_trgm,postgres_fdw,tablefunc,uuid-ossp,timescaledb,pg_partman
    log_destination: csvlog
    log_directory: ../pg_log
    log_file_mode: '0644'
    log_filename: postgresql-%u.log
    log_rotation_age: 1d
    log_truncate_on_rotation: 'on'
    logging_collector: 'on'
    pg_stat_statements.track_utility: 'off'
    shared_buffers: 0MB
    shared_preload_libraries: bg_mon,pg_stat_statements,pgextwlist,pg_auth_mon,set_user,pg_cron,pg_stat_kcache,pgaudit
    ssl: 'on'
    ssl_ca_file: /tls/ca.crt
    ssl_cert_file: /tls/tls.crt
    ssl_key_file: /tls/tls.key
  pg_hba:
  - hostssl replication     standby     all             md5
  - hostssl all             postgres    localhost       md5
  - hostssl all             postgres    all             md5
  - local   all             postgres                    peer
  - local   all             all                         md5
  - local   replication     standby                     md5
  - hostssl all             all         all             md5 clientcert=verify-ca
  pgpass: /run/postgresql/pgpass
  use_unix_socket: true
  use_unix_socket_repl: true
restapi:
  connect_address: 10.22.63.71:8008
  listen: :8008
scope: pg-pgteststage2

Patroni YAML using external etcd:

bootstrap:
  dcs:
    failsafe_mode: false
    loop_wait: 10
    maximum_lag_on_failover: 33554432
    postgresql:
      parameters:
        archive_mode: 'on'
        archive_timeout: 1800s
        autovacuum_analyze_scale_factor: 0.02
        autovacuum_max_workers: 5
        autovacuum_vacuum_scale_factor: 0.05
        checkpoint_completion_target: 0.9
        effective_cache_size: 0MB
        hot_standby: 'on'
        log_autovacuum_min_duration: 0
        log_checkpoints: 'on'
        log_connections: 'on'
        log_disconnections: 'on'
        log_line_prefix: '%t [%p]: [%l-1] %c %x %d %u %a %h '
        log_lock_waits: 'on'
        log_min_duration_statement: 500
        log_rotation_size: '100000'
        log_statement: ddl
        log_temp_files: 0
        maintenance_work_mem: 0MB
        max_connections: '300'
        max_replication_slots: 10
        max_wal_senders: 10
        tcp_keepalives_idle: 900
        tcp_keepalives_interval: 100
        track_functions: all
        wal_compression: 'on'
        wal_level: hot_standby
        wal_log_hints: 'on'
        work_mem: 1MB
      use_pg_rewind: true
      use_slots: true
    retry_timeout: 10
    standby_cluster:
      create_replica_methods:
      - bootstrap_standby_with_wale
      - basebackup_fast_xlog
      restore_command: envdir "/run/etc/wal-e.d/env-standby" timeout "0" /scripts/restore_command.sh "%f" "%p"
    ttl: 30
  initdb:
  - auth-host: md5
  - auth-local: trust
  - data-checksums
  - encoding: UTF8
  - locale: en_US.UTF-8
  post_init: /scripts/post_init.sh "zalandos"
  users:
    zalandos:
      options:
      - CREATEDB
      - NOLOGIN
      password: ''
etcd:
  host: etcd.postgres-etcd1.svc.cluster.local:2379
  use_endpoints: true
postgresql:
  authentication:
    replication:
      password: Password
      username: standby
    superuser:
      password: Password
      username: postgres
  basebackup_fast_xlog:
    command: /scripts/basebackup.sh
    retries: 2
  bin_dir: /usr/lib/postgresql/14/bin
  bootstrap_standby_with_wale:
    command: envdir "/run/etc/wal-e.d/env-standby" bash /scripts/wale_restore.sh
    no_master: 1
    retries: 2
    threshold_backup_size_percentage: 30
    threshold_megabytes: 102400
  callbacks:
    on_role_change: /scripts/on_role_change.sh zalandos python3 /scripts/callback_role.py
    on_start: python3 /scripts/callback_role.py
    on_stop: python3 /scripts/callback_role.py
  connect_address: 10.22.50.27:5432
  create_replica_method:
  - wal_e
  - basebackup_fast_xlog
  data_dir: /home/postgres/pgdata/pgroot/data
  listen: '*:5432'
  name: pg-pgteststage2-0
  parameters:
    archive_command: envdir "/run/etc/wal-e.d/env" wal-g wal-push "%p"
    bg_mon.history_buckets: 120
    bg_mon.listen_address: 0.0.0.0
    extwlist.custom_path: /scripts
    extwlist.extensions: btree_gin,btree_gist,citext,extra_window_functions,first_last_agg,hll,hstore,hypopg,intarray,ltree,pgcrypto,pgq,pgq_node,pg_trgm,postgres_fdw,tablefunc,uuid-ossp,timescaledb,pg_partman
    log_destination: csvlog
    log_directory: ../pg_log
    log_file_mode: '0644'
    log_filename: postgresql-%u.log
    log_rotation_age: 1d
    log_truncate_on_rotation: 'on'
    logging_collector: 'on'
    pg_stat_statements.track_utility: 'off'
    shared_buffers: 0MB
    shared_preload_libraries: bg_mon,pg_stat_statements,pgextwlist,pg_auth_mon,set_user,pg_cron,pg_stat_kcache,pgaudit
    ssl: 'on'
    ssl_ca_file: /tls/ca.crt
    ssl_cert_file: /tls/tls.crt
    ssl_key_file: /tls/tls.key
  pg_hba:
  - hostssl replication     standby     all             md5
  - hostssl all             postgres    localhost       md5
  - hostssl all             postgres    all             md5
  - local   all             postgres                    peer
  - local   all             all                         md5
  - local   replication     standby                     md5
  - hostssl all             all         all             md5 clientcert=verify-ca
  pgpass: /run/postgresql/pgpass
  recovery_conf:
    restore_command: envdir "/run/etc/wal-e.d/env" timeout "0" /scripts/restore_command.sh "%f" "%p"
  use_unix_socket: true
  use_unix_socket_repl: true
  wal_e:
    command: envdir /run/etc/wal-e.d/env bash /scripts/wale_restore.sh
    no_master: 1
    retries: 2
    threshold_backup_size_percentage: 30
    threshold_megabytes: 102400
restapi:
  connect_address: 10.22.50.27:8008
  listen: :8008
scope: pg-pgteststage2
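
As a side note (my assumption, not something stated in the issue): the role shown on the pod is the spilo-role label (role_label in the Kubernetes DCS config above), so it can be inspected directly, using the pod and namespace names from the configs:

kubectl -n pg-pgteststage2 get pod pg-pgteststage2-0 -L spilo-role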

A related question was created as #906.

When we run the script below manually, it corrects the pod role to the expected master role.

postgres@pg-pgteststage2-0:/scripts$ python3 callback_role.py on_role_change master pg-pgteststage2
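
For context, this is my reading of the Spilo scripts rather than something confirmed in this issue: callback_role.py patches the pod's spilo-role label through the Kubernetes API, so the manual invocation above should be roughly equivalent to relabeling the pod by hand.

# Hedged sketch: assumes the callback only updates the spilo-role label;
# pod and namespace names are taken from the configs above.
kubectl -n pg-pgteststage2 label pod pg-pgteststage2-0 spilo-role=master --overwrite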