sparklyr / sparklyr

R interface for Apache Spark

Home page: https://spark.rstudio.com/

compute doesn't work in sparklyr 1.8.2

joscani opened this issue

Hi. I found a possible problem with sparklyr 1.8.2 on Spark 2.4.5. Here is my code:

library(tidyverse)
library(sparklyr)


my_init_spark <- function(executor_instances = 10,
                          executor_cores = 5,
                          executor_memory = "10G",
                          driver_memory = "10G",
                          app_name = "churn-b2c") {
  require(sparklyr)
  require(tidyverse)
  
  conf <- sparklyr::spark_config()
  conf$spark.sql.catalogImplementation <- "hive"
  conf$spark.dynamicAllocation.enabled <- "true"
  conf$spark.executor.instances <- executor_instances
  conf$spark.dynamicAllocation.minExecutors <- 1
  conf$spark.dynamicAllocation.maxExecutors <- 40

  conf$spark.executor.cores <- executor_cores
  conf$spark.executor.memory <- executor_memory
  conf$spark.driver.memory <- driver_memory
  conf$spark.memory.fraction <- 0.95
  
  sc <- spark_connect(
    master = "yarn",
    version = "2.4.5",
    config = conf,
    app_name = app_name
  )
  return(sc)
}

sc <- my_init_spark(app_name = "bigb2c-1109", executor_instances = 10,
                    executor_cores = 6, executor_memory = "20G")

iris_sp <- sdf_copy_to(sc, iris)

iris_cache <- iris_sp %>% compute()

The error:

Error in `db_save_query.DBIConnection()`:
! Can't save query to "dbplyr_001".
Caused by error:
! org.apache.spark.sql.catalyst.parser.ParseException: 
no viable alternative at input 'CREATE OR REPLACE TEMPORARY VIEW \n"dbplyr_001"'(line 2, pos 0)

== SQL ==
CREATE OR REPLACE TEMPORARY VIEW 
"dbplyr_001" AS SELECT *
^^^
FROM `iris`

	at org.apache.spark.sql.catalyst.parser.ParseException.withCommand(ParseDriver.scala:241)
	at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parse(ParseDriver.scala:117)
	at org.apache.spark.sql.execution.SparkSqlParser.parse(SparkSqlParser.scala:48)
	at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parsePlan(ParseDriver.scala:69)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:643)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at sparklyr.Invoke.invoke(invoke.scala:161)
	at sparklyr.StreamHandler.handleMethodCall(stream.scala:141)
	at sparklyr.StreamHandler.read(stream.scala:62)
	at sparklyr.BackendHandler$$anonfun$channelRead0$1.apply$mcV$sp(handler.scala:60)
	at scala.util.control.Breaks.breakable(Breaks.scala:38)
	at sparklyr.BackendHandler.channelRead0(handler.scala:40)
	at sparklyr.BackendHandler.channelRead0(handler.scala:14)
	at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:105)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360)
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352)
	at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360)
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352)
	at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:328)
	at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:302)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360)
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352)
	at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1422)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360)
	at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:931)
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:163)
	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:700)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:635)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:552)
	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:514)
	at io.netty.util.concurrent.SingleThreadEventExecutor$6.run(SingleThreadEventExecutor.java:1044)
	at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
	at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
	at java.lang.Thread.run(Thread.java:750)

The same code works with older sparklyr versions on the same Spark cluster.

Any ideas? Thanks a lot.
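
As a side note: the ParseException shows the view name being quoted with ANSI double quotes ("dbplyr_001"), which Spark SQL's parser does not accept; Spark expects backtick-quoted identifiers. A possible interim workaround, sketched below (not from this thread; the name "iris_cache" is just an illustrative choice), is to register and cache the table through sparklyr's own helpers, which go through the Spark DataFrame API rather than a CREATE TEMPORARY VIEW statement:

# Workaround sketch: avoid dbplyr's db_save_query() path entirely.
# sdf_register() registers the query via the DataFrame API
# (createOrReplaceTempView), so no SQL identifier quoting is involved,
# and tbl_cache() caches the named table.
sdf_register(iris_sp, "iris_cache")
tbl_cache(sc, "iris_cache")
iris_cache <- dplyr::tbl(sc, "iris_cache")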

Even in 1.8.1, this works:

# with sparklyr 1.8.1 and spark 2.4.8
library(sparklyr)
library(tidyverse)
sc <- spark_connect(master = "local")
iris_sp <- sdf_copy_to(sc, iris)
iris_cache <- iris_sp %>% compute()
iris_cache
# Source: spark<dbplyr_002> [?? x 5]
   Sepal_Length Sepal_Width Petal_Length Petal_Width Species
          <dbl>       <dbl>        <dbl>       <dbl> <chr>  
 1          5.1         3.5          1.4         0.2 setosa 
 2          4.9         3            1.4         0.2 setosa 
 3          4.7         3.2          1.3         0.2 setosa 
 4          4.6         3.1          1.5         0.2 setosa 
 5          5           3.6          1.4         0.2 setosa 
 6          5.4         3.9          1.7         0.4 setosa 
 7          4.6         3.4          1.4         0.3 setosa 
 8          5           3.4          1.5         0.2 setosa 
 9          4.4         2.9          1.4         0.2 setosa 
10          4.9         3.1          1.5         0.1 setosa 

The same code in sparklyr 1.8.2:

> iris_cache <- iris_sp %>% compute()
Error in `db_save_query.DBIConnection()`:
! Can't save query to "dbplyr_001".
Caused by error:
! org.apache.spark.sql.catalyst.parser.ParseException: 
no viable alternative at input 'CREATE OR REPLACE TEMPORARY VIEW \n"dbplyr_001"'(line 2, pos 0)

Minimal reprex on 1.8.2 using a "local" Spark session:

suppressPackageStartupMessages(library(sparklyr))
packageVersion("sparklyr")
#> [1] '1.8.2'
suppressPackageStartupMessages(library(dplyr))
sc <- spark_connect("local")
tbl_mtcars <- copy_to(sc, mtcars)
tbl_mtcars %>% count(am) %>% compute()
#> Error in `db_save_query.DBIConnection()`:
#> ! Can't save query to "dbplyr_001".
#> Caused by error:
#> ! org.apache.spark.sql.catalyst.parser.ParseException: 
#> no viable alternative at input 'CREATE OR REPLACE TEMPORARY VIEW \n"dbplyr_001"'(line 2, pos 0)
#> 
#> == SQL ==
#> CREATE OR REPLACE TEMPORARY VIEW 
#> "dbplyr_001" AS SELECT `am`, COUNT(*) AS `n`
#> ^^^
#> FROM `mtcars`
#> GROUP BY `am`
#> 
#>  at org.apache.spark.sql.catalyst.parser.ParseException.withCommand(ParseDriver.scala:266)
#>  at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parse(ParseDriver.scala:133)
#>  at org.apache.spark.sql.execution.SparkSqlParser.parse(SparkSqlParser.scala:48)
#>  at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parsePlan(ParseDriver.scala:81)
#>  at org.apache.spark.sql.SparkSession.$anonfun$sql$2(SparkSession.scala:604)
#>  at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
#>  at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:604)
#>  at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763)
#>  at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:601)
#>  at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
#>  at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
#>  at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
#>  at java.lang.reflect.Method.invoke(Method.java:498)
#>  at sparklyr.Invoke.invoke(invoke.scala:161)
#>  at sparklyr.StreamHandler.handleMethodCall(stream.scala:141)
#>  at sparklyr.StreamHandler.read(stream.scala:62)
#>  at sparklyr.BackendHandler.$anonfun$channelRead0$1(handler.scala:60)
#>  at scala.util.control.Breaks.breakable(Breaks.scala:42)
#>  at sparklyr.BackendHandler.channelRead0(handler.scala:41)
#>  at sparklyr.BackendHandler.channelRead0(handler.scala:14)
#>  at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
#>  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
#>  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
#>  at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
#>  at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102)
#>  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
#>  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
#>  at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
#>  at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:321)
#>  at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:295)
#>  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
#>  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
#>  at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
#>  at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410)
#>  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
#>  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
#>  at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919)
#>  at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:163)
#>  at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:714)
#>  at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650)
#>  at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576)
#>  at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493)
#>  at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989)
#>  at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
#>  at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
#>  at java.lang.Thread.run(Thread.java:750)
#> Backtrace:
#>      ▆
#>   1. ├─tbl_mtcars %>% count(am) %>% compute()
#>   2. ├─dplyr::compute(.)
#>   3. ├─sparklyr:::compute.tbl_spark(.)
#>   4. ├─base::NextMethod()
#>   5. └─dbplyr:::compute.tbl_sql(.)
#>   6.   ├─dbplyr::db_compute(...)
#>   7.   └─dbplyr:::db_compute.DBIConnection(...)
#>   8.     └─dbplyr:::dbplyr_save_query(con, sql, table, temporary = temporary)
#>   9.       └─dbplyr:::dbplyr_fallback(con, "db_save_query", ...)
#>  10.         ├─rlang::eval_bare(expr((!!fun)(con, ...)))
#>  11.         └─dbplyr:::db_save_query.DBIConnection(con, ...)
#>  12.           └─base::tryCatch(...)
#>  13.             └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#>  14.               └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#>  15.                 └─value[[3L]](cond)
#>  16.                   └─cli::cli_abort("Can't save query to {.val {name}}.", parent = cnd)
#>  17.                     └─rlang::abort(...)
DBI::dbListTables(sc)
#> [1] "mtcars"
spark_disconnect(sc)

Created on 2023-08-04 with reprex v2.0.2
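
For context on what is failing: the backtrace shows the call landing in dbplyr's generic db_save_query.DBIConnection(), whose generated SQL wraps the view name in ANSI double quotes, and Spark's parser rejects that. The quoting difference is easy to confirm at the SQL level (an illustrative sketch, run against any open Spark session; "v1" is just a placeholder name):

# Double-quoted identifiers fail to parse in Spark SQL...
try(DBI::dbExecute(sc, 'CREATE OR REPLACE TEMPORARY VIEW "v1" AS SELECT 1'))
#> Error: ... ParseException: no viable alternative at input ...
# ...while backtick-quoted identifiers work.
DBI::dbExecute(sc, "CREATE OR REPLACE TEMPORARY VIEW `v1` AS SELECT 1")

Presumably the dev version restores a Spark-specific method on that generic. A sketch of the kind of method involved (illustrative only, not sparklyr's actual patch) would backtick-quote the name before handing the statement to Spark:

# Hypothetical Spark-aware method for dbplyr's db_save_query() generic:
# quote the view name with backticks so Spark's parser accepts it.
db_save_query.spark_connection <- function(con, sql, name, temporary = TRUE, ...) {
  DBI::dbExecute(con, paste0(
    "CREATE OR REPLACE TEMPORARY VIEW `", name, "` AS ", sql
  ))
  name
}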

But it does work with the dev version:

suppressPackageStartupMessages(library(sparklyr))
packageVersion("sparklyr")
#> [1] '1.8.2.9000'
suppressPackageStartupMessages(library(dplyr))
sc <- spark_connect("local")
tbl_mtcars <- copy_to(sc, mtcars)
tbl_mtcars %>% count(am) %>% compute()
#> # Source: spark<dbplyr_001> [?? x 2]
#>      am     n
#>   <dbl> <dbl>
#> 1     1    13
#> 2     0    19
DBI::dbListTables(sc)
#> [1] "dbplyr_001" "mtcars"
spark_disconnect(sc)

Created on 2023-08-04 with reprex v2.0.2

@joscani - would you mind installing the dev version and trying it again? I was able to confirm that it works with a "local" Spark session, but I see that you need it to work on YARN, so it would be good to make sure:

remotes::install_github("sparklyr/sparklyr")
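
After installing and restarting R, packageVersion("sparklyr") should report a development build (the dev reprex above shows '1.8.2.9000'; the exact suffix may differ):

packageVersion("sparklyr")
#> [1] '1.8.2.9000'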

Thanks @edgararuiz.
I'll try it next Monday.

It works. Thanks a lot, @edgararuiz.