topic manager should retry
frairon opened this issue · comments
In rare cases, topic creation can be delayed, so the topic manager should retry when verifying a topic, like this:
// getTopicConfigMap looks up the configuration entries of the given topic via
// m.admin.DescribeConfig and returns them keyed by entry name.
//
// The lookup is run through withRetry with a retry count of 3 and a backoff
// built by NewSimpleBackoff(1*time.Second, 1*time.Second). If withRetry
// reports an error, that error is wrapped with the topic name and returned
// together with a nil map.
func (m *topicManager) getTopicConfigMap(topic string) (map[string]sarama.ConfigEntry, error) {
	resource := sarama.ConfigResource{
		Type: sarama.TopicResource,
		Name: topic,
	}

	// entries is filled in by the closure below; only the result of the last
	// invocation is kept.
	var entries []sarama.ConfigEntry
	describe := func() error {
		var describeErr error
		entries, describeErr = m.admin.DescribeConfig(resource)
		return describeErr
	}

	retryErr := withRetry(describe, "get topic config", 3, NewSimpleBackoff(1*time.Second, 1*time.Second))
	if retryErr != nil {
		// The config could not be fetched even after retrying; the original
		// author's note treats this as a sign of an unstable cluster.
		return nil, fmt.Errorf("Error getting config for topic %s: %w", topic, retryErr)
	}

	// Index the returned entries by name for direct lookup.
	byName := make(map[string]sarama.ConfigEntry, len(entries))
	for i := range entries {
		byName[entries[i].Name] = entries[i]
	}
	return byName, nil
}
// withRetry calls actor up to retries times and returns nil as soon as one
// call succeeds. Between two consecutive attempts it sleeps for
// backoff.Duration(); it does not sleep after the final attempt, so a
// permanently failing actor is reported without an extra trailing delay.
//
// If every attempt fails, the returned error aggregates the error of each
// attempt followed by a final "<message> failed with too many retries" error.
// If retries is zero or negative, actor is never called and only that final
// error is returned.
func withRetry(actor func() error, message string, retries int, backoff Backoff) error {
	// Named errs rather than errors so the standard errors package is not shadowed.
	var errs *multierror.Error

	for attempt := 0; attempt < retries; attempt++ {
		// Back off only before a re-attempt, never before the first try or
		// after the last one.
		if attempt > 0 {
			time.Sleep(backoff.Duration())
		}

		err := actor()
		if err == nil {
			return nil
		}
		errs = multierror.Append(errs, err)
	}

	return multierror.Append(errs, fmt.Errorf("%s failed with too many retries", message))
}
Also check whether an existing library could handle the retry instead of this custom solution.