cannot parse structure information of Linux kernel module - ELF format
VVvector opened this issue · comments
Using pyelftools 0.31
I want to use the pyelftools to parse the structure information in Linux kernel module ELF binary, but the following error occurs:
parse_top_die_by_cu(dwarfinfo)
File "Z:\linux_module_driver_github\linux_module_driver\test\pyelf.py", line 94, in parse_top_die_by_cu
die_info_rec(child)
File "Z:\linux_module_driver_github\linux_module_driver\test\pyelf.py", line 72, in die_info_rec
die_info_rec(child, name)
File "Z:\linux_module_driver_github\linux_module_driver\test\pyelf.py", line 54, in die_info_rec
member_type = die_type_rec(die, None)
File "Z:\linux_module_driver_github\linux_module_driver\test\pyelf.py", line 46, in die_type_rec
ref_die = dwarfinfo.get_DIE_from_refaddr(ref)
File "C:\Users\vec\AppData\Roaming\Python\Python38\site-packages\elftools\dwarf\dwarfinfo.py", line 168, in get_DIE_from_refaddr
return cu.get_DIE_from_refaddr(refaddr)
File "C:\Users\vec\AppData\Roaming\Python\Python38\site-packages\elftools\dwarf\compileunit.py", line 130, in get_DIE_from_refaddr
return self._get_cached_DIE(refaddr)
File "C:\Users\vec\AppData\Roaming\Python\Python38\site-packages\elftools\dwarf\compileunit.py", line 241, in _get_cached_DIE
die = DIE(cu=self, stream=top_die_stream, offset=offset)
File "C:\Users\vec\AppData\Roaming\Python\Python38\site-packages\elftools\dwarf\die.py", line 98, in __init__
self._parse_DIE()
File "C:\Users\vec\AppData\Roaming\Python\Python38\site-packages\elftools\dwarf\die.py", line 241, in _parse_DIE
abbrev_decl = self.cu.get_abbrev_table().get_abbrev(self.abbrev_code)
File "C:\Users\vec\AppData\Roaming\Python\Python38\site-packages\elftools\dwarf\abbrevtable.py", line 36, in get_abbrev
return self._abbrev_map[code]
KeyError: 101
Reproduce
- Download the attached Linux kernel module ELF binary
- Put the downloaded Linux kernel module ELF binary in the same directory as the following script
- Execute the following script to parse the Linux kernel module ELF binary
import argparse
import json
import os
from collections import defaultdict
from typing import Optional
from elftools.dwarf.die import DIE
from elftools.elf.elffile import ELFFile
from loguru import logger
logger.add('test.log')
Map_TypePrefix = {
'DW_TAG_base_type': '',
'DW_TAG_structure_type': 'struct.',
'DW_TAG_union_type': 'union.',
'DW_TAG_pointer_type': 'pointer.'
}
Map_AnonTypes = {
'DW_TAG_subroutine_type': 'subroutine',
'DW_TAG_pointer_type': 'pointer',
'DW_TAG_union_type': 'union'
}
# recursive function to get type of a DIE node
def die_type_rec(die: DIE, prev: Optional[DIE]):
t = die.attributes.get("DW_AT_type")
if t is None:
print("***type1***", die)
prefix = '*' if prev.tag == 'DW_TAG_pointer_type' else ''
# got a type
if die.attributes.get("DW_AT_name"):
# common named type with prefix
return prefix + Map_TypePrefix.get(die.tag, f'unknown: {die.tag}') \
+ die.attributes.get("DW_AT_name").value.decode()
elif die.tag == 'DW_TAG_structure_type' and prev.tag == 'DW_TAG_typedef':
# typedef-ed anonymous struct
return prefix + 'struct.' + prev.attributes.get("DW_AT_name").value.decode()
else:
# no name types
return prefix + Map_AnonTypes.get(die.tag, f'unknown: {die.tag}')
elif t.form == 'DW_FORM_ref4':
ref = t.value
ref_die = dwarfinfo.get_DIE_from_refaddr(ref)
return die_type_rec(ref_die, die)
# recursive function to get all struct members
def die_info_rec(die: DIE, name=''):
if die.tag == 'DW_TAG_member' and die.attributes.get("DW_AT_name"):
member_name = die.attributes.get("DW_AT_name").value.decode()
member_type = die_type_rec(die, None)
member_offset = die.attributes.get("DW_AT_data_member_location").value
# save to return data
if member_type.startswith('*'):
# pointer member, change to *name -> type
struct_data[name]['*' + member_name] = member_type[1:]
else:
struct_data[name][member_name] = member_type
print(f' > .{member_name}, type: {member_type}, offset: {member_offset}')
if die.tag == 'DW_TAG_structure_type' and die.attributes.get("DW_AT_name"):
name = 'struct.' + die.attributes.get("DW_AT_name").value.decode()
size = die.attributes.get("DW_AT_byte_size").value
# recursion into all children DIE
for child in die.iter_children():
die_info_rec(child, name)
def parse_top_die_by_cu(dwarfinfo):
j = 0
for CU in dwarfinfo.iter_CUs():
j = j + 1
logger.debug(' Found a compile unit at offset %s, length %s' % (CU.cu_offset, CU['unit_length']))
# Start with the top DIE, the root for this CU's DIE tree
top_DIE = CU.get_top_DIE()
logger.debug("------------------------Top Die[{}] start-----------------------------------------".format(j))
logger.debug(top_DIE)
# Display DIEs recursively starting with top_DIE
i = 0
for child in top_DIE.iter_children():
# for child in CU.iter_DIEs():
i = i + 1
logger.debug("Top Die[{}]->child[{}]:", j, i)
logger.debug(child)
die_info_rec(child)
logger.debug("------------------------Top Die[{}] end-----------------------------------------".format(j))
# dict for all struct members
struct_data = defaultdict(dict)
elf_file = ".\\test.ko"
print('Processing file:', elf_file)
f = open(elf_file, 'rb')
elffile = ELFFile(f)
if not elffile.has_dwarf_info():
print(f'ERROR: input file {elf_file} has no DWARF info')
exit(1)
dwarfinfo = elffile.get_dwarf_info()
parse_top_die_by_cu(dwarfinfo)
f.close()
More
I tried to analyze and found that the value parsing of DW_AT_type in some structures was inconsistent with the data parsed by readelf tool.
e.g.
- parsed by readelf
<2>: Abbrev Number: 3 (DW_TAG_member)
DW_AT_name : (indirect string, offset: 0x3d85): modinfo_attrs
DW_AT_decl_file : 6
DW_AT_decl_line : 379
DW_AT_decl_column : 27
DW_AT_type : <0x68e3>
DW_AT_data_member_location: 176
- parsed by pyelftools:
2024-03-29 22:32:53.114 | DEBUG | main:parse_each_die_by_cu:116 - CU[2]->die[103]:
2024-03-29 22:32:53.117 | DEBUG | main:parse_each_die_by_cu:117 - DIE DW_TAG_member, size=14, has_children=False
|DW_AT_name : AttributeValue(name='DW_AT_name', form='DW_FORM_strp', value=b'modinfo_attrs', raw_value=15749, offset=2895, indirection_length=0)
|DW_AT_decl_file : AttributeValue(name='DW_AT_decl_file', form='DW_FORM_data1', value=6, raw_value=6, offset=2899, indirection_length=0)
|DW_AT_decl_line : AttributeValue(name='DW_AT_decl_line', form='DW_FORM_data2', value=379, raw_value=379, offset=2900, indirection_length=0)
|DW_AT_decl_column : AttributeValue(name='DW_AT_decl_column', form='DW_FORM_data1', value=27, raw_value=27, offset=2902, indirection_length=0)
|DW_AT_type : AttributeValue(name='DW_AT_type', form='DW_FORM_ref4', value=24917, raw_value=24917, offset=2903, indirection_length=0)
|DW_AT_data_member_location: AttributeValue(name='DW_AT_data_member_location', form='DW_FORM_data1', value=176, raw_value=176, offset=2907, indirection_length=0)
For the same DIE, the DW_AT_type obtained by the above two methods is different, one is 0x68e3, and the other is 24917 (0x6155).
Solution: attributes of the form DW_FORM_ref4
are expected to contain the offset of the target DIE relative to the current CU, as opposed to the info section. So the elif
block in die_type_rec()
should go instead:
elif t.form == 'DW_FORM_ref4':
ref = t.value
ref_die = dwarfinfo.get_DIE_from_refaddr(ref + die.cu.cu_offset) # Note the addend!
return die_type_rec(ref_die, die)
Both GNU readelf
and pyelftools' readelf.py display that attribute's value with CU offset already added. It's more useful that way, displays a section relative offset you can search by.
Also, there is get_DIE_from_attribute()
in the DIE object - specifically to follow inter-DIE references.
There are other problems in the script. The assumptions about struct member DIE contents don't hold for bitfield types.
For easier eyeballing of the DWARF, may I recommend DWARF Explorer ( https://github.com/sevaa/dwex ). Full disclosure: I wrote it.
Solution: attributes of the form
DW_FORM_ref4
are expected to contain the offset of the target DIE relative to the current CU, as opposed to the info section. So theelif
block indie_type_rec()
should go instead:elif t.form == 'DW_FORM_ref4': ref = t.value ref_die = dwarfinfo.get_DIE_from_refaddr(ref + die.cu.cu_offset) # Note the addend! return die_type_rec(ref_die, die)
Both GNU
readelf
and pyelftools' readelf.py display that attribute's value with CU offset already added. It's more useful that way, displays a section relative offset you can search by.Also, there is
get_DIE_from_attribute()
in the DIE object - specifically to follow inter-DIE references.There are other problems in the script. The assumptions about struct member DIE contents don't hold for bitfield types.
For easier eyeballing of the DWARF, may I recommend DWARF Explorer ( https://github.com/sevaa/dwex ). Full disclosure: I wrote it.
Thank you so much. I have verified the script with your comments. In addition, I have processed the bitfiled part accordingly.