whamcloud / iml-agent

Integrated Manager for Lustre Agent

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

_has_link() in lib/corosync.py returns the wrong status when the link state is Unknown

freishutz opened this issue · comments

iml-net-ring1_1

on the host side:

[root@sfa7990-c0 ~]# ip link set up dev enp0s20f0u8u2c2 ; echo $?
0
[root@sfa7990-c0 ~]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host 
       valid_lft forever preferred_lft forever
2: eno5: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
    link/ether 00:01:ff:0d:00:40 brd ff:ff:ff:ff:ff:ff
    inet 10.36.44.22/22 brd 10.36.47.255 scope global dynamic eno5
       valid_lft 13617sec preferred_lft 13617sec
    inet6 fe80::201:ffff:fe0d:40/64 scope link 
       valid_lft forever preferred_lft forever
3: eno6: <BROADCAST,MULTICAST> mtu 1500 qdisc mq state DOWN group default qlen 1000
    link/ether 00:01:ff:4d:00:40 brd ff:ff:ff:ff:ff:ff
4: enp0s20f0u8u2c2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UNKNOWN group default qlen 1000
    link/ether 3a:18:3b:7c:53:19 brd ff:ff:ff:ff:ff:ff
    inet6 fe80::3818:3bff:fe7c:5319/64 scope link 
       valid_lft forever preferred_lft forever
5: ib0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 2044 qdisc mq state UP group default qlen 256
    link/infiniband 00:00:07:f9:fe:80:00:00:00:00:00:00:50:6b:4b:03:00:23:b7:cc brd 00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff
    inet 172.172.172.22/24 brd 172.172.172.255 scope global ib0
       valid_lft forever preferred_lft forever
    inet6 fe80::526b:4b03:23:b7cc/64 scope link 
       valid_lft forever preferred_lft forever
6: ib1: <BROADCAST,MULTICAST> mtu 4092 qdisc mq state DOWN group default qlen 256
    link/infiniband 00:00:11:b0:fe:80:00:00:00:00:00:00:50:6b:4b:03:00:23:b7:cd brd 00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff

[root@sfa7990-c0 ~]# cat /sys/class/net/enp0s20f0u8u2c2/carrier
1
[root@sfa7990-c0 ~]# cat /sys/class/net/enp0s20f0u8u2c2/operstate 
unknown

[root@sfa7990-c1 ~]# for nic in $(ls -1 /sys/class/net/); do echo $nic: $(readlink /sys/class/net/$nic/device/driver); done
eno5: ../../../../bus/pci/drivers/igb
eno6: ../../../../bus/pci/drivers/igb
enp0s20f0u8u2c2: ../../../../../../../bus/usb/drivers/cdc_ether
ib0: ../../../../bus/pci/drivers/mlx5_core
ib1: ../../../../bus/pci/drivers/mlx5_core
lo:

[root@sfa7990-c1 ~]# cat /sys/class/net/*/device/interface 
CDC Notification Interface

[root@sfa7990-c1 ~]# modinfo cdc_ether
filename:       /lib/modules/3.10.0-957.el7_lustre.x86_64/kernel/drivers/net/usb/cdc_ether.ko.xz
license:        GPL
description:    USB CDC Ethernet devices
author:         David Brownell
retpoline:      Y
rhelversion:    7.6
srcversion:     D329B19ACE6E9677F544BB8

[root@sfa7990-c0 tmp]# python
Python 2.7.5 (default, Oct 30 2018, 23:45:53) 
[GCC 4.8.5 20150623 (Red Hat 4.8.5-36)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import array, struct, fcntl, socket
>>> SIOCETHTOOL = 0x8946
>>> ETHTOOL_GLINK = 0x0000000a
>>> sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
>>> struct
struct
>>> ecmd = array.array('B', struct.pack('2I', ETHTOOL_GLINK, 0))
>>> ifreq = struct.pack('16sP', 'enp0s20f0u8u2c2', ecmd.buffer_info()[0])
>>> fcntl.ioctl(sock.fileno(), SIOCETHTOOL, ifreq)
'enp0s20f0u8u2c2\x000\x14J\x01\x00\x00\x00\x00'
>>> sock.close()
>>> struct.unpack('4xI', ecmd.tostring())[0]
1
>>> def _has_link(name):
...     import array
...     import struct
...     import fcntl
...     import socket
...     SIOCETHTOOL = 0x8946
...     ETHTOOL_GLINK = 0x0000000a
...     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
...     ecmd = array.array('B', struct.pack('2I', ETHTOOL_GLINK, 0))
...     ifreq = struct.pack('16sP', name, ecmd.buffer_info()[0])
...     fcntl.ioctl(sock.fileno(), SIOCETHTOOL, ifreq)
...     sock.close()
...     return bool(struct.unpack('4xI', ecmd.tostring())[0])
... 
>>> 
>>> 
>>> 
>>> _has_link('ib0')
True
>>> _has_link('ib1')
False
>>> _has_link('lo')
True
>>> _has_link('enp0s20f0u8u2c2')
True

According to https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-class-net, an interface can be in any of the following states:

What:		/sys/class/net/<iface>/operstate
Date:		March 2006
KernelVersion:	2.6.17
Contact:	netdev@vger.kernel.org
Description:
		Indicates the interface RFC2863 operational state as a string.
		Possible values are:
		"unknown", "notpresent", "down", "lowerlayerdown", "testing",
		"dormant", "up".

I suggest changing the code logic so that every state except 'up' returns False.

@freishutz Can we get access to a system where this is occurring?

Hi @freishutz, I may have missed this but is there a system setup where I can get access to this? Thanks.

Hi @freishutz, I have put together a fix that I believe will resolve this issue. Would it be possible to connect to the same cluster so I can test with the new rpm? I'm also attaching the binary to this ticket in case you want to try it out as well.