#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -u

export HIBENCH_PRINTFULLLOG=0
this="${BASH_SOURCE-$0}"
workload_func_bin=$(cd -P -- "$(dirname -- "$this")" && pwd -P)
. ${workload_func_bin}/assert.sh
. ${workload_func_bin}/color.sh

HIBENCH_CONF_FOLDER=${HIBENCH_CONF_FOLDER:-${workload_func_bin}/../../conf}

function enter_bench(){ # declare the entrance of a workload
    assert $1 "Workload name not specified."
    assert $2 "Workload config file not specified."
    assert $3 "Current workload folder not specified."
    export HIBENCH_CUR_WORKLOAD_NAME=$1
    workload_config_file=$2
    workload_folder=$3
    shift 3
    patching_args=$@
    echo "patching args=$patching_args"
    local CONF_FILE=`${workload_func_bin}/load_config.py ${HIBENCH_CONF_FOLDER} $workload_config_file $workload_folder $patching_args`
    . $CONF_FILE
}
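
# Illustrative usage (sketch only; the workload name and variables below are
# examples, not taken from this file): a workload's run script would typically
# declare itself before doing any work, and call leave_bench when finished, e.g.
#   enter_bench ScalaSparkWordcount ${workload_config_file} ${workload_folder}
#   ...
#   leave_bench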

function leave_bench(){ # declare the workload is finished
    assert $HIBENCH_CUR_WORKLOAD_NAME "BUG, HIBENCH_CUR_WORKLOAD_NAME unset."
    unset HIBENCH_CUR_WORKLOAD_NAME
}

function show_bannar(){ # print banner (function name spelling kept as-is for existing callers)
    assert $HIBENCH_CUR_WORKLOAD_NAME "HIBENCH_CUR_WORKLOAD_NAME not specified."
    assert $1 "Unknown banner operation"
    echo -e "${BGreen}$1 ${Color_Off}${UGreen}$HIBENCH_CUR_WORKLOAD_NAME${Color_Off} ${BGreen}bench${Color_Off}"
}

function timestamp(){ # get current timestamp in milliseconds
    sec=`date +%s`
    nanosec=`date +%N`
    re='^[0-9]+$'
    if ! [[ $nanosec =~ $re ]] ; then # %N is not supported on all platforms; fall back to 0
        nanosec=0
    fi
    tmp=`expr $sec \* 1000 `
    msec=`expr $nanosec / 1000000 `
    echo `expr $tmp + $msec`
}
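
# Illustrative usage (sketch, not from this file): runner scripts typically
# bracket a job with timestamps so gen_report can compute the duration, e.g.
#   START_TIME=`timestamp`
#   # ... run the workload ...
#   END_TIME=`timestamp`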

function start_monitor(){
    MONITOR_PID=`${workload_func_bin}/monitor.py ${HIBENCH_CUR_WORKLOAD_NAME} $$ ${WORKLOAD_RESULT_FOLDER}/monitor.log ${WORKLOAD_RESULT_FOLDER}/bench.log ${WORKLOAD_RESULT_FOLDER}/monitor.html ${SLAVES} &`
#    echo "start monitor, got child pid:${MONITOR_PID}" > /dev/stderr
    echo ${MONITOR_PID}
}

function stop_monitor(){
    MONITOR_PID=$1
    assert $1 "monitor pid missing"
#    echo "stop monitor, kill ${MONITOR_PID}" > /dev/stderr
    kill ${MONITOR_PID}
}

function get_field_name() { # print report column header
    printf "${REPORT_COLUMN_FORMATS}" Type Date Time Input_data_size "Duration(s)" "Throughput(bytes/s)" Throughput/node
}

function gen_report() { # dump the result to report file
    assert ${HIBENCH_CUR_WORKLOAD_NAME} "HIBENCH_CUR_WORKLOAD_NAME not specified."
    local start=$1
    local end=$2
    local size=$3
    which bc > /dev/null 2>&1
    if [ $? -eq 1 ]; then
        assert 0 "\"bc\" utility missing. Please install it to generate a proper report."
        return 1
    fi
    local duration=$(echo "scale=3;($end-$start)/1000"|bc)
    local tput=`echo "$size/$duration"|bc`
#    local nodes=`cat ${SPARK_HOME}/conf/slaves 2>/dev/null | grep -v '^\s*$' | sed "/^#/ d" | wc -l`
    local nodes=`echo ${SLAVES} | wc -w`
    nodes=${nodes:-1}
    if [ $nodes -eq 0 ]; then nodes=1; fi
    local tput_node=`echo "$tput/$nodes"|bc`

    REPORT_TITLE=`get_field_name`
    if [ ! -f ${HIBENCH_REPORT}/${HIBENCH_REPORT_NAME} ] ; then
        echo "${REPORT_TITLE}" > ${HIBENCH_REPORT}/${HIBENCH_REPORT_NAME}
    fi
    REPORT_LINE=$(printf "${REPORT_COLUMN_FORMATS}" ${HIBENCH_CUR_WORKLOAD_NAME} $(date +%F) $(date +%T) $size $duration $tput $tput_node)
    echo "${REPORT_LINE}" >> ${HIBENCH_REPORT}/${HIBENCH_REPORT_NAME}
    echo "# ${REPORT_TITLE}" >> ${HIBENCH_WORKLOAD_CONF}
    echo "# ${REPORT_LINE}" >> ${HIBENCH_WORKLOAD_CONF}
}
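
# Illustrative usage (sketch; assumes SIZE was measured beforehand, for example
# with dir_size on the workload's input directory — variable names are examples):
#   SIZE=`dir_size ${INPUT_HDFS}`
#   gen_report ${START_TIME} ${END_TIME} ${SIZE}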

function rmr_hdfs(){ # rm -r for hdfs
    assert $1 "dir parameter missing"
    RMDIR_CMD="fs -rm -r -skipTrash"
    local CMD="$HADOOP_EXECUTABLE --config $HADOOP_CONF_DIR $RMDIR_CMD $1"
    echo -e "${BCyan}hdfs rm -r: ${Cyan}${CMD}${Color_Off}" 1>&2
    execute_withlog ${CMD}
}

function upload_to_hdfs(){
    assert $1 "local parameter missing"
    assert $2 "remote parameter missing"
    LOCAL_FILE_PATH=$1
    REMOTE_FILE_PATH=$2
    echo "REMOTE_FILE_PATH:$REMOTE_FILE_PATH" 1>&2
    if [[ `echo $REMOTE_FILE_PATH | tr A-Z a-z` = hdfs://* ]]; then # strip leading "HDFS://xxx:xxx/" string
        echo "HDFS_MASTER:$HDFS_MASTER" 1>&2
        local LEADING_HDFS_STRING_LENGTH=${#HDFS_MASTER}
        REMOTE_FILE_PATH=${REMOTE_FILE_PATH:$LEADING_HDFS_STRING_LENGTH}
        echo "stripped REMOTE_FILE_PATH:$REMOTE_FILE_PATH" 1>&2
    fi

    # clear previous package file
    local CMD="$HADOOP_EXECUTABLE --config $HADOOP_CONF_DIR fs -rm $REMOTE_FILE_PATH"
    echo -e "${BCyan}hdfs rm : ${Cyan}${CMD}${Color_Off}" 1>&2
    execute_withlog ${CMD}

    # prepare parent folder
    CMD="$HADOOP_EXECUTABLE --config $HADOOP_CONF_DIR fs -mkdir `dirname $REMOTE_FILE_PATH`"
    echo -e "${BCyan}hdfs mkdir : ${Cyan}${CMD}${Color_Off}" 1>&2
    execute_withlog ${CMD}

    # upload
    CMD="$HADOOP_EXECUTABLE --config $HADOOP_CONF_DIR fs -put $LOCAL_FILE_PATH $REMOTE_FILE_PATH"
    echo -e "${BCyan}hdfs put : ${Cyan}${CMD}${Color_Off}" 1>&2
    execute_withlog ${CMD}
}

function dus_hdfs(){ # du -s for hdfs
    assert $1 "dir parameter missing"
    DUS_CMD="fs -du -s"
    local CMD="$HADOOP_EXECUTABLE --config $HADOOP_CONF_DIR $DUS_CMD $1"
    echo -e "${BPurple}hdfs du -s: ${Purple}${CMD}${Color_Off}" 1>&2
    execute_withlog ${CMD}
}

function check_dir() { # ensure dir exists and is writable
    local dir=$1
    assert $1 "dir parameter missing"
    if [ -z "$dir" ]; then
        echo -e "${BYellow}WARN${Color_Off}: payload missing."
        return 1
    fi
    if [ ! -d "$dir" ]; then
        echo -e "${BRed}ERROR${Color_Off}: directory $dir does not exist."
        exit 1
    fi
    touch "$dir"/touchtest
    if [ $? -ne 0 ]; then
        echo -e "${BRed}ERROR${Color_Off}: directory unwritable."
        exit 1
    else
        rm "$dir"/touchtest
    fi
}

function dir_size() {
    for item in $(dus_hdfs $1); do
        if [[ $item =~ ^[0-9]+$ ]]; then
            echo $item
        fi
    done
}

function run_spark_job() {
    LIB_JARS=
    while (($#)); do
        if [ "$1" = "--jars" ]; then
            LIB_JARS="--jars $2"
            shift 2
            continue
        fi
        break
    done

    CLS=$1
    shift

    export_withlog SPARKBENCH_PROPERTIES_FILES

    YARN_OPTS=""
    if [[ "$SPARK_MASTER" == yarn-* ]] || [[ "$SPARK_MASTER" == yarn ]]; then
        export_withlog HADOOP_CONF_DIR
        YARN_OPTS="--num-executors ${YARN_NUM_EXECUTORS}"
        if [[ -n "${YARN_EXECUTOR_CORES:-}" ]]; then
            YARN_OPTS="${YARN_OPTS} --executor-cores ${YARN_EXECUTOR_CORES}"
        fi
        if [[ -n "${SPARK_YARN_EXECUTOR_MEMORY:-}" ]]; then
            YARN_OPTS="${YARN_OPTS} --executor-memory ${SPARK_YARN_EXECUTOR_MEMORY}"
        fi
        if [[ -n "${SPARK_YARN_DRIVER_MEMORY:-}" ]]; then
            YARN_OPTS="${YARN_OPTS} --driver-memory ${SPARK_YARN_DRIVER_MEMORY}"
        fi
    fi

    if [[ "$CLS" == *.py ]]; then
        LIB_JARS="$LIB_JARS --jars ${SPARKBENCH_JAR}"
        SUBMIT_CMD="${SPARK_HOME}/bin/spark-submit ${LIB_JARS} --properties-file ${SPARK_PROP_CONF} --master ${SPARK_MASTER} ${YARN_OPTS} ${CLS} $@"
    else
        SUBMIT_CMD="${SPARK_HOME}/bin/spark-submit ${LIB_JARS} --properties-file ${SPARK_PROP_CONF} --class ${CLS} --master ${SPARK_MASTER} --conf spark.kubernetes.container.image=pl4tinum/hibench-kube:spark-3.0-hadoop-2.7-scala-2.12 --conf spark.kubernetes.authenticate.driver.serviceAccountName=default --conf spark.kubernetes.container.image.pullPolicy=Always --conf spark.kubernetes.namespace=iccs-hibench --conf spark.driver.host=hibench-master-nfs-svc --conf spark.driver.port=5000 --conf spark.kubernetes.executor.podTemplateFile=/executor.yaml ${YARN_OPTS} ${SPARKBENCH_JAR} $@"
    fi
    echo -e "${BGreen}Submit Spark job: ${Green}${SUBMIT_CMD}${Color_Off}"

    MONITOR_PID=`start_monitor`
    execute_withlog ${SUBMIT_CMD}
    result=$?
    stop_monitor ${MONITOR_PID}

    if [ $result -ne 0 ]
    then
        echo -e "${BRed}ERROR${Color_Off}: Spark job ${BYellow}${CLS}${Color_Off} failed to run successfully."
        echo -e "${BBlue}Hint${Color_Off}: You can go to ${BYellow}${WORKLOAD_RESULT_FOLDER}/bench.log${Color_Off} to check the detailed log.\nOpening log tail for you:\n"
        tail ${WORKLOAD_RESULT_FOLDER}/bench.log
        exit $result
    fi
}
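
# Illustrative usage (the class name below is an example and may not exist in
# every build of the benchmark jar):
#   run_spark_job com.intel.hibench.sparkbench.micro.ScalaWordCount ${INPUT_HDFS} ${OUTPUT_HDFS}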

function run_storm_job(){
    CMD="${STORM_HOME}/bin/storm jar ${STREAMBENCH_STORM_JAR} $@"
    echo -e "${BGreen}Submit Storm Job: ${Green}$CMD${Color_Off}"
    execute_withlog $CMD
}

function run_gearpump_app(){
    CMD="${GEARPUMP_HOME}/bin/gear app -executors ${STREAMBENCH_GEARPUMP_EXECUTORS} -jar ${STREAMBENCH_GEARPUMP_JAR} $@"
    echo -e "${BGreen}Submit Gearpump Application: ${Green}$CMD${Color_Off}"
    execute_withlog $CMD
}

function run_flink_job(){
    CMD="${FLINK_HOME}/bin/flink run -p ${STREAMBENCH_FLINK_PARALLELISM} -m ${HIBENCH_FLINK_MASTER} $@ ${STREAMBENCH_FLINK_JAR} ${SPARKBENCH_PROPERTIES_FILES}"
    echo -e "${BGreen}Submit Flink Job: ${Green}$CMD${Color_Off}"
    execute_withlog $CMD
}

function run_hadoop_job(){
    ENABLE_MONITOR=1
    if [ "$1" = "--without-monitor" ]; then
        ENABLE_MONITOR=0
        shift 1
    fi
    local job_jar=$1
    shift
    local job_name=$1
    shift
    local tail_arguments=$@
    local CMD="${HADOOP_EXECUTABLE} --config ${HADOOP_CONF_DIR} jar $job_jar $job_name $tail_arguments"
    echo -e "${BGreen}Submit MapReduce Job: ${Green}$CMD${Color_Off}"
    if [ ${ENABLE_MONITOR} = 1 ]; then
        MONITOR_PID=`start_monitor`
    fi
    execute_withlog ${CMD}
    result=$?
    if [ ${ENABLE_MONITOR} = 1 ]; then
        stop_monitor ${MONITOR_PID}
    fi
    if [ $result -ne 0 ]; then
        echo -e "${BRed}ERROR${Color_Off}: Hadoop job ${BYellow}${job_jar} ${job_name}${Color_Off} failed to run successfully."
        echo -e "${BBlue}Hint${Color_Off}: You can go to ${BYellow}${WORKLOAD_RESULT_FOLDER}/bench.log${Color_Off} to check the detailed log.\nOpening log tail for you:\n"
        tail ${WORKLOAD_RESULT_FOLDER}/bench.log
        exit $result
    fi
}
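
# Illustrative usage (the jar variable and job name are hypothetical examples):
#   run_hadoop_job ${HADOOP_EXAMPLES_JAR} wordcount ${INPUT_HDFS} ${OUTPUT_HDFS}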

function ensure_hivebench_release(){
    if [ ! -e ${HIBENCH_HOME}"/hadoopbench/sql/target/"$HIVE_RELEASE".tar.gz" ]; then
        assert 0 "Error: The hive bin file hasn't been downloaded by maven, please check!"
        exit
    fi

    cd ${HIBENCH_HOME}"/hadoopbench/sql/target"
    if [ ! -d $HIVE_HOME ]; then
        tar zxf $HIVE_RELEASE".tar.gz"
    fi
    export_withlog HADOOP_EXECUTABLE
}

function ensure_mahout_release (){
    if [ ! -e ${HIBENCH_HOME}"/hadoopbench/mahout/target/"$MAHOUT_RELEASE".tar.gz" ]; then
        assert 0 "Error: The mahout bin file hasn't been downloaded by maven, please check!"
        exit
    fi

    cd ${HIBENCH_HOME}"/hadoopbench/mahout/target"
    if [ ! -d $MAHOUT_HOME ]; then
        tar zxf $MAHOUT_RELEASE".tar.gz"
    fi
    export_withlog HADOOP_EXECUTABLE
    export_withlog HADOOP_HOME
    export_withlog HADOOP_CONF_DIR
}

function execute () {
    CMD="$@"
    echo -e "${BCyan}Executing: ${Cyan}${CMD}${Color_Off}"
    $CMD
}

function printFullLog(){
    export HIBENCH_PRINTFULLLOG=1
}

function execute_withlog () {
    CMD="$@"
    if [ -t 1 ] ; then # Terminal, beautify the output.
        ${workload_func_bin}/execute_with_log.py ${WORKLOAD_RESULT_FOLDER}/bench.log $CMD
    else # pipe, do nothing.
        $CMD
    fi
}

function export_withlog () {
    var_name=$1
    var_val=${!1}
    assert $1 "export without a variable name!"
    echo -e "${BCyan}Export env: ${Cyan}${var_name}${BCyan}=${Cyan}${var_val}${Color_Off}"
    export ${var_name}
}

function command_exist ()
{
    result=$(which $1)
    if [ $? -eq 0 ]
    then
        return 0
    else
        return 1
    fi
}

function ensure_nutchindexing_release () {
    if [ ! -e ${HIBENCH_HOME}"/hadoopbench/nutchindexing/target/apache-nutch-1.2-bin.tar.gz" ]; then
        assert 0 "Error: The nutch bin file hasn't been downloaded by maven, please check!"
        exit
    fi

    NUTCH_ROOT=${WORKLOAD_RESULT_FOLDER}
    cp -a $NUTCH_DIR/nutch $NUTCH_ROOT

    cd ${HIBENCH_HOME}"/hadoopbench/nutchindexing/target"
    if [ ! -d $NUTCH_HOME ]; then
        tar zxf apache-nutch-1.2-bin.tar.gz
    fi
    find $NUTCH_HOME/lib ! -name "lucene-*" -type f -exec rm -rf {} \;
    rm -rf $NUTCH_ROOT/nutch_release
    cp -a $NUTCH_HOME $NUTCH_ROOT/nutch_release
    NUTCH_HOME_WORKLOAD=$NUTCH_ROOT/nutch_release
    cp $NUTCH_ROOT/nutch/conf/nutch-site.xml $NUTCH_HOME_WORKLOAD/conf
    cp $NUTCH_ROOT/nutch/bin/nutch $NUTCH_HOME_WORKLOAD/bin

    # Patching jcl-over-slf4j version against cdh or hadoop2
    mkdir $NUTCH_HOME_WORKLOAD/temp
    unzip -q $NUTCH_HOME_WORKLOAD/nutch-1.2.job -d $NUTCH_HOME_WORKLOAD/temp
    rm -f $NUTCH_HOME_WORKLOAD/temp/lib/jcl-over-slf4j-*.jar
    rm -f $NUTCH_HOME_WORKLOAD/temp/lib/slf4j-log4j*.jar
    cp ${NUTCH_DIR}/target/dependency/jcl-over-slf4j-*.jar $NUTCH_HOME_WORKLOAD/temp/lib
    rm -f $NUTCH_HOME_WORKLOAD/nutch-1.2.job
    cd $NUTCH_HOME_WORKLOAD/temp
    zip -qr $NUTCH_HOME_WORKLOAD/nutch-1.2.job *
    rm -rf $NUTCH_HOME_WORKLOAD/temp
    echo $NUTCH_HOME_WORKLOAD
}

function prepare_sql_aggregation () {
    assert $1 "SQL file path not specified"
    HIVEBENCH_SQL_FILE=$1

    find . -name "metastore_db" -exec rm -rf "{}" \; 2>/dev/null

    cat <<EOF > ${HIVEBENCH_SQL_FILE}
USE DEFAULT;
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set ${MAP_CONFIG_NAME}=$NUM_MAPS;
set ${REDUCER_CONFIG_NAME}=$NUM_REDS;
set hive.stats.autogather=false;
DROP TABLE IF EXISTS uservisits;
CREATE EXTERNAL TABLE uservisits (sourceIP STRING,destURL STRING,visitDate STRING,adRevenue DOUBLE,userAgent STRING,countryCode STRING,languageCode STRING,searchWord STRING,duration INT ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS SEQUENCEFILE LOCATION '$INPUT_HDFS/uservisits';
DROP TABLE IF EXISTS uservisits_aggre;
CREATE EXTERNAL TABLE uservisits_aggre ( sourceIP STRING, sumAdRevenue DOUBLE) STORED AS SEQUENCEFILE LOCATION '$OUTPUT_HDFS/uservisits_aggre';
INSERT OVERWRITE TABLE uservisits_aggre SELECT sourceIP, SUM(adRevenue) FROM uservisits GROUP BY sourceIP;
EOF
}

function prepare_sql_join () {
    assert $1 "SQL file path not specified"
    HIVEBENCH_SQL_FILE=$1

    find . -name "metastore_db" -exec rm -rf "{}" \; 2>/dev/null

    cat <<EOF > ${HIVEBENCH_SQL_FILE}
USE DEFAULT;
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set ${MAP_CONFIG_NAME}=$NUM_MAPS;
set ${REDUCER_CONFIG_NAME}=$NUM_REDS;
set hive.stats.autogather=false;
DROP TABLE IF EXISTS rankings;
CREATE EXTERNAL TABLE rankings (pageURL STRING, pageRank INT, avgDuration INT) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS SEQUENCEFILE LOCATION '$INPUT_HDFS/rankings';
DROP TABLE IF EXISTS uservisits_copy;
CREATE EXTERNAL TABLE uservisits_copy (sourceIP STRING,destURL STRING,visitDate STRING,adRevenue DOUBLE,userAgent STRING,countryCode STRING,languageCode STRING,searchWord STRING,duration INT ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS SEQUENCEFILE LOCATION '$INPUT_HDFS/uservisits';
DROP TABLE IF EXISTS rankings_uservisits_join;
CREATE EXTERNAL TABLE rankings_uservisits_join ( sourceIP STRING, avgPageRank DOUBLE, totalRevenue DOUBLE) STORED AS SEQUENCEFILE LOCATION '$OUTPUT_HDFS/rankings_uservisits_join';
INSERT OVERWRITE TABLE rankings_uservisits_join SELECT sourceIP, avg(pageRank), sum(adRevenue) as totalRevenue FROM rankings R JOIN (SELECT sourceIP, destURL, adRevenue FROM uservisits_copy UV WHERE (datediff(UV.visitDate, '1999-01-01')>=0 AND datediff(UV.visitDate, '2000-01-01')<=0)) NUV ON (R.pageURL = NUV.destURL) group by sourceIP order by totalRevenue DESC;
EOF
}

function prepare_sql_scan () {
    assert $1 "SQL file path not specified"
    HIVEBENCH_SQL_FILE=$1

    find . -name "metastore_db" -exec rm -rf "{}" \; 2>/dev/null

    cat <<EOF > ${HIVEBENCH_SQL_FILE}
USE DEFAULT;
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set ${MAP_CONFIG_NAME}=$NUM_MAPS;
set ${REDUCER_CONFIG_NAME}=$NUM_REDS;
set hive.stats.autogather=false;
DROP TABLE IF EXISTS uservisits;
CREATE EXTERNAL TABLE uservisits (sourceIP STRING,destURL STRING,visitDate STRING,adRevenue DOUBLE,userAgent STRING,countryCode STRING,languageCode STRING,searchWord STRING,duration INT ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS SEQUENCEFILE LOCATION '$INPUT_HDFS/uservisits';
DROP TABLE IF EXISTS uservisits_copy;
CREATE EXTERNAL TABLE uservisits_copy (sourceIP STRING,destURL STRING,visitDate STRING,adRevenue DOUBLE,userAgent STRING,countryCode STRING,languageCode STRING,searchWord STRING,duration INT ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS SEQUENCEFILE LOCATION '$OUTPUT_HDFS/uservisits_copy';
INSERT OVERWRITE TABLE uservisits_copy SELECT * FROM uservisits;
EOF
}