No support for splitting the driver into offloaded and non-offloaded parts.
piotrows opened this issue · comments
Zbigniew Piotrowski commented
It is impossible to compute part of the driver on the device and part on the host, since the acc exit data directive is always placed at the end of the file. Thus, the offloaded data is not accessible to potential diagnostics or other operations that by design need to be performed on the host. A possible solution seems to be to introduce a Loki directive that would instruct AnnotateTransformation to finalise the offloaded program region.
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.
import pytest
from loki import (
Subroutine, Dimension, fgen, Sourcefile, SubroutineItem
)
from transformations import (
SCCDevectorTransformation, SCCHoistTransformation,
SCCDemoteTransformation, SCCAnnotateTransformation,
)
from conftest import available_frontends
#pylint: disable=too-many-lines
@pytest.fixture(scope='module', name='horizontal')
def fixture_horizontal():
    """
    Module-scoped fixture providing the ``horizontal`` dimension.

    The dimension is declared with size ``nlon``, index ``jl`` and the
    bounds pair ``('start', 'end')``.
    """
    return Dimension(
        name='horizontal',
        size='nlon',
        index='jl',
        bounds=('start', 'end'),
    )
@pytest.fixture(scope='module', name='vertical')
def fixture_vertical():
    """
    Module-scoped fixture providing the ``vertical`` dimension.

    The dimension is declared with size ``nz`` and index ``jk``.
    """
    return Dimension(
        name='vertical',
        size='nz',
        index='jk',
    )
@pytest.fixture(scope='module', name='blocking')
def fixture_blocking():
    """
    Module-scoped fixture providing the ``blocking`` dimension.

    The dimension is declared with size ``nb`` and index ``b``.
    """
    return Dimension(
        name='blocking',
        size='nb',
        index='b',
    )
@pytest.mark.parametrize('frontend', available_frontends())
def test_scc_hoist_multiple_kernels(frontend, horizontal, vertical, blocking):
    """
    Reproducer for a driver that calls an offloaded kernel followed by a
    "diagnostic" kernel.

    The same SCC pipeline (devector, demote, hoist, annotate with OpenACC)
    is run in two scenarios:

    1. Only ``compute_column`` is a target of the driver; the generated
       driver code is printed.
    2. Both ``compute_column`` and ``compute_error`` are targets and both
       kernels are transformed.

    The test always ends with an unconditional ``assert False`` and
    therefore never passes (presumably so that the printed output gets
    reported by the test runner -- TODO confirm intent).
    """

    # Fortran sources are kept verbatim; they are parsed several times below.
    fcode_driver = """
SUBROUTINE column_driver(nlon, nz, q, nb)
INTEGER, INTENT(IN) :: nlon, nz, nb ! Size of the horizontal and vertical
REAL, INTENT(INOUT) :: q(nlon,nz,nb)
REAL :: talt(nlon,nz)
INTEGER :: b, start, end
start = 1
end = nlon
do b=1, nb
call compute_column(start, end, nlon, nz, q(:,:,b))
end do
call compute_error(start, end, nlon, nz, q(:,:,1))
END SUBROUTINE column_driver
"""
    fcode_kernel2 = """
SUBROUTINE compute_error(start, end, nlon, nz, q)
INTEGER, INTENT(IN) :: start, end ! Iteration indices
INTEGER, INTENT(IN) :: nlon, nz ! Size of the horizontal and vertical
REAL, INTENT(INOUT) :: q(nlon,nz)
INTEGER :: jl, jk
REAL :: c,d
d = 0
DO jk = 1, nz
c = SUM(q(1:nlon,jk))
d = d+c
END DO
END SUBROUTINE compute_error
"""
    fcode_kernel = """
SUBROUTINE compute_column(start, end, nlon, nz, q)
INTEGER, INTENT(IN) :: start, end ! Iteration indices
INTEGER, INTENT(IN) :: nlon, nz ! Size of the horizontal and vertical
REAL, INTENT(INOUT) :: q(nlon,nz)
REAL :: t(nlon,nz)
INTEGER :: jl, jk
REAL :: c
c = 5.345
DO jk = 2, nz
DO jl = start, end
t(jl, jk) = c * k
q(jl, jk) = q(jl, jk-1) + t(jl, jk) * c
END DO
END DO
! The scaling is purposefully upper-cased
DO JL = START, END
Q(JL, NZ) = Q(JL, NZ) * C
END DO
END SUBROUTINE compute_column
"""

    # Scheduler items for the two driver variants (both carry the same name).
    item_both_targets = SubroutineItem(
        name='#column_driver',
        source=Sourcefile.from_source(fcode_driver, frontend=frontend)
    )
    item_single_target = SubroutineItem(
        name='#column_driver',
        source=Sourcefile.from_source(fcode_driver, frontend=frontend)
    )

    column_kernel = Subroutine.from_source(fcode_kernel, frontend=frontend)
    error_kernel = Subroutine.from_source(fcode_kernel2, frontend=frontend)

    # Driver for the second scenario: both kernel sources are attached to its calls.
    driver_both_targets = Subroutine.from_source(fcode_driver, frontend=frontend)
    for callee in (column_kernel, error_kernel):
        driver_both_targets.enrich_calls(callee)

    # Driver for the first scenario: only the column kernel is attached.
    driver_single_target = Subroutine.from_source(fcode_driver, frontend=frontend)
    driver_single_target.enrich_calls(column_kernel)

    # The transformation stages, instantiated in the order they are applied.
    pipeline = (
        SCCDevectorTransformation(horizontal=horizontal),
        SCCDemoteTransformation(horizontal=horizontal),
        SCCHoistTransformation(
            horizontal=horizontal, vertical=vertical, block_dim=blocking
        ),
        SCCAnnotateTransformation(
            horizontal=horizontal, vertical=vertical, directive='openacc',
            block_dim=blocking, hoist_column_arrays=True
        ),
    )

    def run_pipeline(driver_routine, item, targets, kernels):
        # Every stage processes the driver first and then each kernel,
        # before the next stage is started.
        for stage in pipeline:
            stage.apply(driver_routine, role='driver', targets=targets, item=item)
            for routine in kernels:
                stage.apply(routine, role='kernel')

    run_pipeline(
        driver_single_target, item_single_target,
        targets=['compute_column'], kernels=(column_kernel,)
    )

    print("*******First scenario: \"diagnostic\" kernel is not processed by Loki")
    print(fgen(driver_single_target))
    print("*******Exit data placed after \"diagnostic\" kernel call, so it can't access data while being executed on host")

    print("*******Second scenario: \"diagnostic\" kernel is processed by Loki, but lacks horizontal loop by design. Transformation unsuccessful")
    # NOTE: the column kernel object is the one already transformed in the
    # first scenario; it is passed through the pipeline a second time here.
    run_pipeline(
        driver_both_targets, item_both_targets,
        targets=['compute_column', 'compute_error'],
        kernels=(column_kernel, error_kernel)
    )

    # Unconditional failure: this reproducer is not meant to pass.
    assert False