hidet-org / hidet

An open-source efficient deep learning framework/compiler, written in python.

Home Page: https://hidet.org

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

[Bug] Repeated include statements within the same source.cu/source.cc file

BolinSNLHM opened this issue · comments

Describe the bug
While working on benchmarking Hidet for CPU support, I noticed that during the tuning process, within a single source.cc/source.cu file, the block of #include statements:

#include <stdint.h>
#include <hidet/runtime/symbols.h>
#include <hidet/runtime/memory_planner.h>
#include <hidet/runtime/cpu/context.h>
#include <hidet/runtime/cuda/complex.h>
#include <hidet/runtime/cuda/context.h>
#include <hidet/runtime/logging.h>

will appear multiple times, once before each candidate_xx namespace.

You can refer to the attached source.cu file as an example.

To Reproduce
Script:

import hidet
from typing import List
import hidet.testing


# Keep compilation artifacts in a local directory so the generated
# source.cc/source.cu files can be inspected (this is where the repeated
# #include blocks show up).
hidet.option.cache_dir('./mycache-gpt2')
# Search space 2 — presumably the larger tuning space that produces many
# candidate_xx kernels per operator; matches build(space=2) below.
hidet.option.search_space(2)

def generate_hidet(model, text, input_ids, position_ids, past_keys, past_values, device, tokens_to_generate=10):
    """Autoregressively generate ``tokens_to_generate`` token ids with ``model``.

    Each call to ``model`` returns the next input ids, position ids and the
    updated KV caches; those are threaded back into the next step.  The first
    element of every step's input ids is collected and returned as a list.

    NOTE(review): ``text`` and ``device`` are accepted for signature
    compatibility but are not used inside this function.
    """
    generated = []
    state = (input_ids, position_ids, past_keys, past_values)
    step = 0
    while step < tokens_to_generate:
        state = model(*state)
        generated.append(state[0][0].item())
        step += 1
    return generated


# --- Build and compile a symbolic GPT-2 graph --------------------------------
gpt2_module = hidet.testing.models.gpt2.model(disable_cache=True)
gpt2_module.cuda()

# Symbolic inputs: token/position ids over a dynamic sequence length, plus
# KV caches with a dynamic previous-sequence dimension.
input_ids = hidet.symbol(['seq_length'], dtype=hidet.int32, device='cuda')
position_ids = hidet.symbol(['seq_length'], dtype=hidet.int32, device='cuda')
cache_shape = [gpt2_module.num_hidden_layers, gpt2_module.num_heads, 'prev_seq_length', gpt2_module.head_dim]

past_keys = hidet.symbol(cache_shape, dtype=hidet.float32, device='cuda')
past_values = hidet.symbol(cache_shape, dtype=hidet.float32, device='cuda')

outputs = gpt2_module(input_ids, position_ids, past_keys, past_values)
graph = hidet.trace_from(outputs, inputs=[input_ids, position_ids, past_keys, past_values])
graph = hidet.graph.optimize(graph)

compiled_model = graph.build(space=2)
compiled_model.save('./benchmark_outs2/compiled.hidet')

text = "This is just an example..."

# --- Concrete inputs for benchmarking ----------------------------------------
hidet_tokenizer = hidet.testing.models.gpt2.tokenizer()
hidet_input_ids_list: List[int] = hidet_tokenizer(text)['input_ids']
hidet_input_ids = hidet.asarray(hidet_input_ids_list, dtype=hidet.int32, device='cuda')
hidet_position_ids = hidet.arange(hidet_input_ids.shape[0], dtype=hidet.int32, device='cuda')

# Empty (zero-length) KV caches for the first generation step.  Built once and
# reused in the benchmark lambda below: the original script re-created
# identical zero tensors inline, leaving these two variables dead.  The lambda
# arguments were evaluated only once anyway (when benchmark_func was called),
# so reusing these tensors is behavior-identical.
empty_cache_shape = [gpt2_module.num_hidden_layers, gpt2_module.num_heads, 0, gpt2_module.head_dim]
hidet_past_keys = hidet.zeros(empty_cache_shape, dtype=hidet.float32, device='cuda')
hidet_past_values = hidet.zeros(empty_cache_shape, dtype=hidet.float32, device='cuda')

hidet_latency = hidet.utils.benchmark_func(
    lambda: generate_hidet(
        compiled_model,
        text,
        hidet_input_ids,
        hidet_position_ids,
        hidet_past_keys,
        hidet_past_values,
        'cuda',
        tokens_to_generate=40,
    ),
    repeat=1,
)