Install Scala
sudo apt install scala
Install sbt
http://www.scala-sbt.org/download.html
Download and extract Spark
http://d3kbcqa49mib13.cloudfront.net/spark-2.0.0-bin-hadoop2.7.tgz
Download and extract Hadoop
http://www.apache.org/dyn/closer.cgi/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz
Add to .bashrc
# JAVA_HOME, HADOOP_HOME, SPARK_HOME
export JAVA_HOME=/usr/lib/jvm/java-8-oracle
export HADOOP_HOME=/usr/local/src/hadoop-2.7.1
export PATH=$HADOOP_HOME/bin:$PATH
export SPARK_DIST_CLASSPATH=$(hadoop classpath)
export SPARK_HOME=/usr/local/src/spark-2.0.0-bin-hadoop2.7
export PATH=$SPARK_HOME/bin:$PATH
export SPARK_LOCAL_IP={ip address}
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native/:$LD_LIBRARY_PATH
Test
./bin/run-example SparkPi 10
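The same smoke test can be done from the pyspark shell (./bin/pyspark, included in the Spark download). A minimal sketch of the Pi estimate, using the sc that the shell predefines; the sample size and partition count here are arbitrary:

# Estimate Pi, mirroring the SparkPi example; run inside the pyspark shell,
# where 'sc' (the SparkContext) is already defined.
from random import random

def inside(_):
    # Sample a point in the unit square; test whether it lands in the quarter circle.
    x, y = random(), random()
    return x * x + y * y < 1

n = 100000
count = sc.parallelize(range(n), 10).filter(inside).count()
print("Pi is roughly %f" % (4.0 * count / n))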
https://spark-packages.org/package/mongodb/mongo-spark
Download and build
https://github.com/mongodb/mongo-spark
Run
pyspark --packages org.mongodb.spark:mongo-spark-connector_2.11:1.0.0
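Once pyspark is up with the connector package, MongoDB collections can be read and written as DataFrames. A minimal sketch, assuming a local MongoDB on 127.0.0.1; the database (test) and collection names (myCollection, myOutput) are placeholders for illustration:

# 'spark' is the SparkSession that the Spark 2.0 pyspark shell predefines.
# Read a collection into a DataFrame through the mongo-spark connector.
df = spark.read.format("com.mongodb.spark.sql.DefaultSource") \
    .option("uri", "mongodb://127.0.0.1/test.myCollection") \
    .load()
df.printSchema()

# Write a DataFrame back to another collection.
df.write.format("com.mongodb.spark.sql.DefaultSource") \
    .option("uri", "mongodb://127.0.0.1/test.myOutput") \
    .mode("append") \
    .save()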
Add to .bashrc
export PYSPARK_DRIVER_PYTHON=ipython
#export PYSPARK_DRIVER_PYTHON_OPTS=qtconsole
Add to .bashrc
export PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH
export PYTHONPATH=$SPARK_HOME/python/lib/py4j-0.10.1-src.zip:$PYTHONPATH
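With PYTHONPATH set as above, pyspark becomes importable from a plain Python interpreter (and hence from Spyder). A quick check, assuming the Spark 2.0.0 install described earlier:

# If the PYTHONPATH entries are picked up, this import succeeds outside of spark-submit.
import pyspark
print(pyspark.__version__)  # expect 2.0.0 for this install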
Create a spyder.py file
cp /usr/bin/spyder /usr/bin/spyder.py
Start Spyder with Spark
spark-submit /usr/bin/spyder.py
Start a Spark session
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("PythonSQL") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
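To confirm the session works, build a small DataFrame and run a SQL query against it. A minimal sketch; the sample data is made up for illustration:

# Register a tiny DataFrame as a temp view and query it with Spark SQL.
df = spark.createDataFrame([(1, "apple"), (2, "banana")], ["id", "name"])
df.createOrReplaceTempView("fruit")
spark.sql("SELECT name FROM fruit WHERE id = 2").show()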
Linux
PYSPARK_DRIVER_PYTHON_OPTS=notebook pyspark
Windows
set PYSPARK_DRIVER_PYTHON_OPTS=notebook
pyspark
Create a profile
ipython profile create pyspark
Create ~/.ipython/profile_pyspark/startup/00-pyspark-setup.py
# Configure the necessary Spark environment
import os
import sys

spark_home = os.environ.get('SPARK_HOME', None)
sys.path.insert(0, spark_home + "/python")

# Add py4j to the path.
# You may need to change the version number to match your install
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.1-src.zip'))

# Initialize PySpark to predefine the SparkContext variable 'sc'
# execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))  # py27
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())  # py35
Run
ipython notebook --profile=pyspark
# jupyter notebook --profile=pyspark
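In a notebook cell, the startup script above should have left sc predefined; a quick sanity check (the numbers are arbitrary):

# 'sc' comes from 00-pyspark-setup.py; run a trivial job to verify the kernel is wired to Spark.
print(sc.version)
print(sc.parallelize(range(1, 101)).sum())  # expect 5050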
Start and stop the cluster
$SPARK_HOME/sbin/start-all.sh
$SPARK_HOME/sbin/stop-all.sh
Connect to the cluster
pyspark --master spark://{master ip}:7077
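A quick way to confirm the shell is really attached to the standalone cluster rather than running locally; the job itself is arbitrary:

# sc.master should echo the spark:// URL passed above.
print(sc.master)
print(sc.defaultParallelism)  # total cores offered by the workers
print(sc.parallelize(range(1000), 8).map(lambda x: x * x).sum())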