[Linux] Install PySpark

How to install PySpark

  • prerequisite: Python 3 (with pip3)

    Code

    # install java
    # note: the webupd8team Oracle Java PPA has been discontinued;
    # OpenJDK 8 from the standard repositories works for Spark
    sudo apt-get install openjdk-8-jdk
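
    # quick sanity check (not in the original guide): the JDK should now be on the PATH
    java -version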

    # set path and variables (OpenJDK path on Ubuntu amd64)
    export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
    export JRE_HOME=$JAVA_HOME/jre

    # install scala
    sudo apt-get install scala
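
    # sanity check: prints the Scala version if the install succeeded
    scala -version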

    # install python-java integration
    pip3 install py4j
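
    # note: as an alternative to the manual download below, Spark can also be
    # installed directly from PyPI (pip3 install pyspark)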

    # download and untar the Spark release (use the latest version; paths below assume 2.4.0)
    wget https://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
    tar -zxvf spark-2.4.0-bin-hadoop2.7.tgz # extract where SPARK_HOME will point, e.g. /home/henry/apache_spark

    # edit path and variables in the shell profile
    vim ~/.bashrc

    # add the following lines at the end of .bashrc
    # (SPARK_HOME and JAVA_HOME must be set before the lines that use them)
    export SPARK_HOME=/home/henry/apache_spark/spark-2.4.0-bin-hadoop2.7
    export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
    export PATH=$PATH:$JAVA_HOME/bin:$SPARK_HOME/bin
    export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH
    export PYSPARK_PYTHON=python3
    # optional: make pyspark start a Jupyter notebook (requires jupyter installed)
    export PYSPARK_DRIVER_PYTHON=jupyter
    export PYSPARK_DRIVER_PYTHON_OPTS="notebook"
    # in vim: press Esc, then type :wq and Enter to save and quit
    # reload the profile so the new variables take effect
    source ~/.bashrc

    # run pyspark
    pyspark
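
To verify the setup end to end, a short smoke test can be run in the notebook that `pyspark` opens (the `spark` session is predefined there; the builder line below is only needed when running the snippet standalone with python3). This is a minimal sketch, not part of the original guide:

    # minimal smoke test for the PySpark installation
    from pyspark.sql import SparkSession

    # in the pyspark shell/notebook, `spark` already exists; getOrCreate() reuses it
    spark = SparkSession.builder.appName("install-check").getOrCreate()

    # distribute a small range and aggregate it (runs in local mode here)
    rdd = spark.sparkContext.parallelize(range(100))
    print(rdd.sum())  # expected output: 4950

    spark.stop()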
