Hadoop Ecosystem Deployment
2019-05-06 06:51:53
louyj
#Install Jdk

[Refer to the JDK installation guide](http://note.louyj.com/blog/post/louyj/Install-JDK)

#Configure SSH

[Refer to the passwordless SSH login guide](http://note.louyj.com/blog/post/louyj/SSH%E5%85%8D%E5%AF%86%E7%A0%81%E7%99%BB%E9%99%86)

#Install Ntp

```
yum install ntp

vi /etc/ntp.conf
server ntp1.aliyun.com iburst
server ntp2.aliyun.com iburst
server ntp3.aliyun.com iburst

systemctl enable ntpd
systemctl start ntpd
```
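Before moving on it is worth confirming that every node is actually syncing against the configured servers; a quick check with the standard ntp tooling (nothing assumed beyond the ntp package installed above):

```
# the selected upstream server is marked with '*'
ntpq -p

# overall clock state as seen by systemd
timedatectl status
```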
#Install Hadoop

##Download Hadoop

wget https://archive.apache.org/dist/hadoop/core/hadoop-2.7.5/hadoop-2.7.5.tar.gz
tar zxvf hadoop-2.7.5.tar.gz
cd hadoop-2.7.5/

##Configure Hadoop

Config Env
```
vi .bashrc

export JAVA_HOME=/opt/jdk1.8.0_202
export HADOOP_PID_DIR=/data/hadooptemp
```
Configure slaves
```
vi slaves

test01
test02
test03
```
Configure core-site.xml
```
mkdir /data/hadoop
mkdir /data/hadooptemp
```
```
<configuration>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/data/hadoop</value>
    <description>A base for other temporary directories.</description>
  </property>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://cluster01</value>
    <description>The name of the default file system. A URI whose scheme and authority determine the FileSystem implementation. The uri's scheme determines the config property (fs.SCHEME.impl) naming the FileSystem implementation class. The uri's authority is used to determine the host, port, etc. for a filesystem.</description>
  </property>
  <property>
    <name>io.file.buffer.size</name>
    <value>40960</value>
    <description>The size of buffer for use in sequence files. The size of this buffer should probably be a multiple of hardware page size (4096 on Intel x86), and it determines how much data is buffered during read and write operations.</description>
  </property>
  <property>
    <name>hadoop.proxyuser.hadoop.groups</name>
    <value>*</value>
    <description>The 'nfsserver' user is allowed to proxy all members of the 'nfs-users1' and 'nfs-users2' groups. Set this to '*' to allow nfsserver user to proxy any group.</description>
  </property>
  <property>
    <name>hadoop.proxyuser.hadoop.hosts</name>
    <value>*</value>
    <description>This is the host where the nfs gateway is running. Set this to '*' to allow requests from any hosts to be proxied.</description>
  </property>
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>test01:2181,test02:2181,test03:2181</value>
  </property>
</configuration>
```
Configure hdfs-site.xml
```
<configuration>
  <property>
    <name>dfs.nameservices</name>
    <value>cluster01</value>
  </property>
  <property>
    <name>dfs.ha.namenodes.cluster01</name>
    <value>nn1,nn2</value>
  </property>
  <property>
    <name>dfs.namenode.rpc-address.cluster01.nn1</name>
    <value>test01:8020</value>
  </property>
  <property>
    <name>dfs.namenode.rpc-address.cluster01.nn2</name>
    <value>test02:8020</value>
  </property>
  <property>
    <name>dfs.namenode.http-address.cluster01.nn1</name>
    <value>test01:9870</value>
  </property>
  <property>
    <name>dfs.namenode.http-address.cluster01.nn2</name>
    <value>test02:9870</value>
  </property>
  <property>
    <name>dfs.namenode.shared.edits.dir</name>
    <value>qjournal://test01:8485;test02:8485;test03:8485/cluster01</value>
  </property>
  <property>
    <name>dfs.client.failover.proxy.provider.cluster01</name>
    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
  </property>
  <property>
    <name>dfs.ha.fencing.methods</name>
    <value>sshfence</value>
  </property>
  <property>
    <name>dfs.ha.fencing.ssh.private-key-files</name>
    <value>/home/exampleuser/.ssh/id_rsa</value>
  </property>
  <property>
    <name>dfs.journalnode.edits.dir</name>
    <value>/data/journalnode</value>
  </property>
  <property>
    <name>dfs.ha.automatic-failover.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
    <description>Default block replication. The actual number of replications can be specified when the file is created. The default is used if replication is not specified in create time.</description>
  </property>
  <property>
    <name>dfs.blocksize</name>
    <value>1048576</value>
    <description>1m. The default block size for new files, in bytes. You can use the following suffix (case insensitive): k(kilo), m(mega), g(giga), t(tera), p(peta), e(exa) to specify the size (such as 128k, 512m, 1g, etc.), or provide complete size in bytes (such as 134217728 for 128 MB).</description>
  </property>
  <property>
    <name>dfs.namenode.handler.count</name>
    <value>100</value>
    <description>The number of Namenode RPC server threads that listen to requests from clients. If dfs.namenode.servicerpc-address is not configured then Namenode RPC server threads listen to requests from all nodes.</description>
  </property>
  <property>
    <name>nfs.dump.dir</name>
    <value>/data/hadoopdump</value>
    <description>This directory is used to temporarily save out-of-order writes before writing to HDFS. For each file, the out-of-order writes are dumped after they are accumulated to exceed certain threshold (e.g., 1MB) in memory. One needs to make sure the directory has enough space.</description>
  </property>
</configuration>
```
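hdfs-site.xml points dfs.journalnode.edits.dir at /data/journalnode, but that directory is not created anywhere above; a minimal sketch of preparing it on the three JournalNode hosts, following the same ssh pattern used elsewhere in this guide:

```
ssh test01 'mkdir -p /data/journalnode'
ssh test02 'mkdir -p /data/journalnode'
ssh test03 'mkdir -p /data/journalnode'
```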
Configure mapred-site.xml
```
<configuration>
  <!-- Configurations for MapReduce Applications -->
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
    <description>The runtime framework for executing MapReduce jobs. Can be one of local, classic or yarn.</description>
  </property>
  <property>
    <name>mapreduce.map.memory.mb</name>
    <value>1024</value>
    <description>The amount of memory to request from the scheduler for each map task. If this is not specified or is non-positive, it is inferred from mapreduce.map.java.opts and mapreduce.job.heap.memory-mb.ratio. If java-opts are also not specified, we set it to 1024.</description>
  </property>
  <property>
    <name>mapreduce.map.java.opts</name>
    <value>-Xmx256M</value>
    <description>Java option for map.</description>
  </property>
  <property>
    <name>mapreduce.reduce.memory.mb</name>
    <value>1024</value>
    <description>The amount of memory to request from the scheduler for each reduce task. If this is not specified or is non-positive, it is inferred from mapreduce.reduce.java.opts and mapreduce.job.heap.memory-mb.ratio. If java-opts are also not specified, we set it to 1024.</description>
  </property>
  <property>
    <name>mapreduce.reduce.java.opts</name>
    <value>-Xmx256M</value>
    <description>Java option for reduce.</description>
  </property>
  <property>
    <name>mapreduce.reduce.shuffle.parallelcopies</name>
    <value>5</value>
    <description>The default number of parallel transfers run by reduce during the copy(shuffle) phase.</description>
  </property>
  <!-- Configurations for MapReduce JobHistory Server -->
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>test01:10020</value>
    <description>MapReduce JobHistory Server IPC host:port</description>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>test01:19888</value>
    <description>MapReduce JobHistory Server Web UI host:port</description>
  </property>
</configuration>
```
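Note that a stock hadoop-2.7.5 distribution ships only mapred-site.xml.template; if mapred-site.xml does not exist yet, create it from the template first and then apply the settings above (the path assumes the layout used in this guide):

```
cd ~/hadoop-2.7.5/etc/hadoop
cp mapred-site.xml.template mapred-site.xml
```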
Configure yarn-site.xml
```
<configuration>
  <!-- Configurations for ResourceManager and NodeManager -->
  <property>
    <name>yarn.acl.enable</name>
    <value>false</value>
    <description>Are acls enabled.</description>
  </property>
  <property>
    <name>yarn.admin.acl</name>
    <value>*</value>
    <description>ACL of who can be admin of the YARN cluster. ACLs are of the form comma-separated-users space comma-separated-groups. Defaults to special value of * which means anyone. Special value of just space means no one has access.</description>
  </property>
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
    <description>Configuration to enable or disable log aggregation</description>
  </property>
  <property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>604800</value>
    <description>How long to keep aggregation logs before deleting them. -1 disables. Be careful: set this too small and you will spam the name node.</description>
  </property>
  <property>
    <name>yarn.log-aggregation.retain-check-interval-seconds</name>
    <value>3600</value>
    <description>How long to wait between aggregated log retention checks.</description>
  </property>
  <!-- Configurations for ResourceManager -->
  <property>
    <name>yarn.resourcemanager.address</name>
    <value>test01:8032</value>
    <description>The address of the applications manager interface in the RM.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address</name>
    <value>test01:8030</value>
    <description>The address of the scheduler interface.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address</name>
    <value>test01:8031</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.address</name>
    <value>test01:8033</value>
    <description>The address of the RM admin interface.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address</name>
    <value>test01:8088</value>
    <description>The http address of the RM web application. If only a host is provided as the value, the webapp will be served on a random port.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
    <description>The class to use as the resource scheduler. CapacityScheduler (recommended), FairScheduler (also recommended), or FifoScheduler.</description>
  </property>
  <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>1</value>
    <description>The minimum allocation for every container request at the RM, in MBs. Memory requests lower than this will throw an InvalidResourceRequestException.</description>
  </property>
  <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>2048</value>
    <description>The maximum allocation for every container request at the RM, in MBs. Memory requests higher than this will throw an InvalidResourceRequestException.</description>
  </property>
  <!-- Configurations for NodeManager -->
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>10240</value>
    <description>Must be larger than 1 GB. Amount of physical memory, in MB, that can be allocated for containers. If set to -1 and yarn.nodemanager.resource.detect-hardware-capabilities is true, it is automatically calculated (in case of Windows and Linux). In other cases, the default is 8192MB.</description>
  </property>
  <property>
    <name>yarn.nodemanager.resource.detect-hardware-capabilities</name>
    <value>false</value>
    <description>Enable auto-detection of node capabilities such as memory and CPU.</description>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-pmem-ratio</name>
    <value>2.1</value>
    <description>Ratio between virtual memory to physical memory when setting memory limits for containers. Container allocations are expressed in terms of physical memory, and virtual memory usage is allowed to exceed this allocation by this ratio.</description>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
    <description></description>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
    <description>A comma separated list of services where service name should only contain a-zA-Z0-9_ and can not start with numbers.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.resourcemanager.cluster-id</name>
    <value>cluster01</value>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.rm-ids</name>
    <value>rm1,rm2</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname.rm1</name>
    <value>test01</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname.rm2</name>
    <value>test02</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address.rm1</name>
    <value>test01:8088</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address.rm2</name>
    <value>test02:8088</value>
  </property>
  <property>
    <name>yarn.resourcemanager.zk-address</name>
    <value>test01:2181,test02:2181,test03:2181</value>
  </property>
</configuration>
```
capacity-scheduler.xml, refresh using `yarn rmadmin -refreshQueues`
```
<property>
  <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
  <value>0.8</value>
</property>
```

##Copy To Slave Nodes
```
scp -r ~/.bashrc test02:~
scp -r ~/.bashrc test03:~
scp -r ~/hadoop-2.7.5/ test02:~/hadoop-2.7.5/
scp -r ~/hadoop-2.7.5/ test03:~/hadoop-2.7.5/
```
#Start Service

In general, it is recommended that HDFS and YARN run as separate users. In the majority of installations, HDFS processes execute as 'hdfs'. YARN typically uses the 'yarn' account.

##Initializing HA state in ZooKeeper

./bin/hdfs zkfc -formatZK

##Format Namenode

on test01

./bin/hdfs namenode -format
scp -r /data/hadoop/dfs/name/ test02:/home/hadoop/dfs

on test02

hdfs namenode -bootstrapStandby

##Start hdfs

on test01

./sbin/start-dfs.sh

check status

hdfs haadmin -getServiceState nn1
hdfs haadmin -getServiceState nn2

##Start yarn

./sbin/start-yarn.sh

##Start WebAppProxy

./sbin/yarn-daemon.sh start proxyserver

##Start JobHistory Server

./sbin/mr-jobhistory-daemon.sh start historyserver

#Check Cluster State

Cluster Status

./bin/hdfs dfsadmin -report

HDFS http://test01:9870
RM http://test01:8088
History http://test01:19888

#Test

hdfs dfs -mkdir -p /test/wordcount
hdfs dfs -mkdir -p /test/output
hdfs dfs -put /root/hadoop-2.7.5/etc/hadoop/core-site.xml /test/wordcount
hdfs dfs -ls /test/wordcount
hadoop jar /root/hadoop-2.7.5/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.5.jar wordcount /test/wordcount /test/output/wordcount

#Flink On Yarn

vi ~/.bashrc
```
export HADOOP_HOME=/data/hadoop-2.7.5
export HADOOP_CLASSPATH=`hadoop classpath`
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
```
Starting a Flink Session on YARN
```
# (1) Start YARN Session
./bin/yarn-session.sh --detached -nm flink1.11.6

# (2) You can now access the Flink Web Interface through the
#     URL printed in the last lines of the command output, or through
#     the YARN ResourceManager web UI.

# (3) Submit example job
./bin/flink run ./examples/streaming/TopSpeedWindowing.jar

# (4) Stop YARN session (replace the application id based
#     on the output of the yarn-session.sh command)
echo "stop" | ./bin/yarn-session.sh -id application_XXXXX_XXX
```
Application Mode

Application Mode will launch a Flink cluster on YARN, where the main() method of the application jar gets executed on the JobManager in YARN. The cluster will shut down as soon as the application has finished.
```
./bin/flink run-application -t yarn-application ./examples/streaming/TopSpeedWindowing.jar

# List running job on the cluster
./bin/flink list -t yarn-application -Dyarn.application.id=application_XXXX_YY

# Cancel running job
./bin/flink cancel -t yarn-application -Dyarn.application.id=application_XXXX_YY <jobId>
```
##Flink on Hudi

Hudi works with Flink 1.13.x. The hudi-flink-bundle jar is archived with Scala 2.11, so it is recommended to use Flink 1.13.x bundled with Scala 2.11 (Hadoop 2.9+).
```
wget https://archive.apache.org/dist/flink/flink-1.13.5/flink-1.13.5-bin-scala_2.11.tgz
wget https://repo.maven.apache.org/maven2/org/apache/hudi/hudi-flink-bundle_2.11/0.10.0/hudi-flink-bundle_2.11-0.10.0.jar
```
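The remaining Flink commands are run from inside the Flink distribution directory, so unpack the tarball first; a small assumed step using the file name downloaded above:

```
tar zxvf flink-1.13.5-bin-scala_2.11.tgz
cd flink-1.13.5
```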
Start a standalone Flink cluster within the hadoop environment.
```
#taskmanager.numberOfTaskSlots: 4
./bin/yarn-session.sh --detached
```
Create the SQL env file `sql-env.yaml`
```
deployment:
  yid: application_1638460628585_0002
```
Now start the SQL CLI:
```
export HADOOP_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath`

./bin/sql-client.sh embedded -e sql-env.yaml -j hudi-flink-bundle_2.11-0.10.0.jar shell
./bin/sql-client.sh embedded -s yarn-session -j hudi-flink-bundle_2.11-0.10.0.jar shell
```
Insert Data
```
set execution.result-mode=tableau;

CREATE TABLE t1(
  uuid VARCHAR(20) PRIMARY KEY NOT ENFORCED,
  name VARCHAR(10),
  age INT,
  ts TIMESTAMP(3),
  `partition` VARCHAR(20)
)
PARTITIONED BY (`partition`)
WITH (
  'connector' = 'hudi',
  'path' = '/hudi/t1',
  'table.type' = 'MERGE_ON_READ' -- this creates a MERGE_ON_READ table, by default is COPY_ON_WRITE
);

-- insert data using values
INSERT INTO t1 VALUES
  ('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1'),
  ('id2','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par1'),
  ('id3','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par2'),
  ('id4','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par2'),
  ('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3'),
  ('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','par3'),
  ('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','par4'),
  ('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','par4');
```
View The Files

hdfs dfs -ls /hudi/t1

Query Data

select * from t1;

Update Data
```
insert into t1 values ('id1','Danny',29,TIMESTAMP '1970-01-01 00:00:01','par1');
```
Streaming Query
```
cp /home/hadoop/hadoop-2.7.5/share/hadoop/mapreduce/hadoop-mapreduce-client*.jar flink-1.13.5/lib
```
```
CREATE TABLE t1(
  uuid VARCHAR(20) PRIMARY KEY NOT ENFORCED,
  name VARCHAR(10),
  age INT,
  ts TIMESTAMP(3),
  `partition` VARCHAR(20)
)
PARTITIONED BY (`partition`)
WITH (
  'connector' = 'hudi',
  'path' = '/hudi/t1',
  'table.type' = 'MERGE_ON_READ',
  'read.streaming.enabled' = 'true',  -- this option enables the streaming read
  'read.start-commit' = '20210316134557', -- specifies the start commit instant time
  'read.streaming.check-interval' = '4' -- specifies the check interval for finding new source commits, default 60s
);

-- Then query the table in stream mode
select * from t1;
```

#Iceberg

##Flink SQL Client

Download the Flink 1.11.x binary package from the Apache Flink download page. We now use Scala 2.12 to archive the Apache iceberg-flink-runtime jar, so it is recommended to use Flink 1.11 bundled with Scala 2.12.

wget https://archive.apache.org/dist/flink/flink-1.11.1/flink-1.11.1-bin-scala_2.12.tgz

cd /home/hadoop/flink-1.11.1
cp -r lib lib-bk
cd lib
cp /home/hadoop/hadoop-2.7.5/share/hadoop/mapreduce/hadoop-mapreduce-client-* .

Start a standalone flink cluster within the hadoop environment.

./bin/yarn-session.sh --detached -nm flink1.11.1
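Before attaching a SQL client it can help to confirm that the session cluster registered with YARN; this is standard YARN CLI, and any of the yarn-sessions started in this guide should show up as RUNNING:

```
# note the application id reported for the Flink session
yarn application -list
```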
Start the flink SQL client. We've created a separate flink-runtime module in the Iceberg project to generate a bundled jar, which can be loaded by the Flink SQL client directly.
```
https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-flink-runtime/
wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime/0.11.1/iceberg-flink-runtime-0.11.1.jar

# download Iceberg dependency
ICEBERG_VERSION=0.11.1
MAVEN_URL=https://repo1.maven.org/maven2
ICEBERG_MAVEN_URL=${MAVEN_URL}/org/apache/iceberg
ICEBERG_PACKAGE=iceberg-flink-runtime
wget ${ICEBERG_MAVEN_URL}/${ICEBERG_PACKAGE}/${ICEBERG_VERSION}/${ICEBERG_PACKAGE}-${ICEBERG_VERSION}.jar

./bin/sql-client.sh embedded -e sql-env.yaml -j iceberg-flink-runtime-0.11.1.jar shell
```
By default, Iceberg includes hadoop jars for the hadoop catalog. If we want to use the hive catalog, we need to load the hive jars when opening the Flink SQL client. Fortunately, Apache Flink provides a bundled hive jar for the SQL client, so we can open the SQL client as follows:
```
wget https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-hive-3.1.2_2.12/1.11.1/flink-sql-connector-hive-3.1.2_2.12-1.11.1.jar

HIVE_VERSION=3.1.2
SCALA_VERSION=2.12
FLINK_VERSION=1.11.1
FLINK_CONNECTOR_URL=${MAVEN_URL}/org/apache/flink
FLINK_CONNECTOR_PACKAGE=flink-sql-connector-hive
wget ${FLINK_CONNECTOR_URL}/${FLINK_CONNECTOR_PACKAGE}-${HIVE_VERSION}_${SCALA_VERSION}/${FLINK_VERSION}/${FLINK_CONNECTOR_PACKAGE}-${HIVE_VERSION}_${SCALA_VERSION}-${FLINK_VERSION}.jar

./bin/sql-client.sh embedded -e sql-env.yaml -j iceberg-flink-runtime-0.11.1.jar -j flink-sql-connector-hive-3.1.2_2.12-1.11.1.jar shell
```
Hive catalog
```
CREATE CATALOG hive_catalog WITH (
  'type'='iceberg',
  'catalog-type'='hive',
  'uri'='thrift://test01:9083',
  'clients'='5',
  'property-version'='1',
  'warehouse'='hdfs://test01:9000/user/hive/warehouse'
);
```
- uri: The Hive metastore's thrift URI. (Required)
- clients: The Hive metastore client pool size, default value is 2. (Optional)
- warehouse: The Hive warehouse location; users should specify this path if they neither set hive-conf-dir to a location containing a hive-site.xml configuration file nor add a correct hive-site.xml to the classpath.
- hive-conf-dir: Path to a directory containing a hive-site.xml configuration file which will be used to provide custom Hive configuration values. The value of hive.metastore.warehouse.dir from <hive-conf-dir>/hive-site.xml (or the hive configuration file on the classpath) will be overwritten with the warehouse value if both hive-conf-dir and warehouse are set when creating the iceberg catalog.

Hadoop catalog

./bin/sql-client.sh embedded -e sql-env.yaml -j iceberg-flink-runtime-0.11.1.jar shell
```
CREATE CATALOG hadoop_catalog WITH (
  'type'='iceberg',
  'catalog-type'='hadoop',
  'warehouse'='hdfs://test01:9000/user/iceberg/warehouse',
  'property-version'='1'
);

use catalog hadoop_catalog;
use `default`;
```
- warehouse: The HDFS directory to store metadata files and data files. (Required)

CREATE DATABASE
```
CREATE DATABASE iceberg_db;
USE iceberg_db;
```
CREATE TABLE

To create a partition table, use PARTITIONED BY:
```
CREATE TABLE `hadoop_catalog`.`default`.`sample` (
  id BIGINT COMMENT 'unique id',
  data STRING
);

CREATE TABLE `hadoop_catalog`.`default`.`sample` (
  id BIGINT COMMENT 'unique id',
  data STRING
) PARTITIONED BY (data);
```
Querying with SQL

Iceberg now supports both streaming and batch reads in Flink. We can execute the following SQL command to switch the execution type from 'streaming' mode to 'batch' mode, and vice versa:
```
-- Execute the flink job in streaming mode for current session context
SET execution.type = streaming

-- Execute the flink job in batch mode for current session context
SET execution.type = batch
```
Flink batch read
```
-- Execute the flink job in batch mode for current session context
SET execution.type = batch ;
SELECT * FROM sample;
```
Flink streaming read
```
-- Submit the flink job in streaming mode for current session.
SET execution.type = streaming ;

-- Enable this switch because streaming read SQL will provide few job options in flink SQL hint options.
SET table.dynamic-table-options.enabled=true;

-- Read all the records from the iceberg current snapshot, and then read incremental data starting from that snapshot.
SELECT * FROM sample /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/ ;

-- Read all incremental data starting from the snapshot-id '3821550127947089987' (records from this snapshot will be excluded).
SELECT * FROM sample /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'start-snapshot-id'='3821550127947089987')*/ ;
```
Writing with SQL

Iceberg supports both INSERT INTO and INSERT OVERWRITE in Flink 1.11.

INSERT INTO
```
INSERT INTO `sample` VALUES (1, 'a');
INSERT INTO `sample` VALUES (2, 'b');
INSERT INTO `sample` SELECT id, data from other_kafka_table;
```

#Spark On Yarn

Configure Env

vi ~/.bashrc
```
export HADOOP_HOME=/data/hadoop-2.7.5
export HADOOP_CLASSPATH=`hadoop classpath`
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
```
Download Spark

wget https://dlcdn.apache.org/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
Start Spark Application (cluster/client)

./bin/spark-submit --class path.to.your.Class --master yarn --deploy-mode cluster [options] <app jar> [app options]

e.g.
```
./bin/spark-submit --class org.apache.spark.examples.SparkPi \
    --master yarn \
    --deploy-mode cluster \
    --driver-memory 4g \
    --executor-memory 2g \
    --executor-cores 1 \
    --queue thequeue \
    --jars my-other-jar.jar,my-other-other-jar.jar \
    examples/jars/spark-examples*.jar \
    10
```
Start Spark Thrift server
```
export HIVE_SERVER2_THRIFT_PORT=10000
export HIVE_SERVER2_THRIFT_BIND_HOST=test01

./sbin/start-thriftserver.sh --master yarn \
    --packages org.apache.hudi:hudi-spark3.1.2-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.1.2 \
    --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
    --conf 'spark.kryoserializer.buffer.max=128m'
```

#Zookeeper

Download Zookeeper

wget --no-check-certificate https://dlcdn.apache.org/zookeeper/zookeeper-3.6.3/apache-zookeeper-3.6.3-bin.tar.gz
tar zxvf apache-zookeeper-3.6.3-bin.tar.gz

Configure Zookeeper
```
cp zoo_sample.cfg zoo.cfg
```
vi zoo.cfg
```
dataDir=/data/zookeeper

server.1=test01:2888:3888
server.2=test02:2888:3888
server.3=test03:2888:3888
```
Create Data Directory
```
ssh test01 'mkdir -p /data/zookeeper && echo 1 > /data/zookeeper/myid'
ssh test02 'mkdir -p /data/zookeeper && echo 2 > /data/zookeeper/myid'
ssh test03 'mkdir -p /data/zookeeper && echo 3 > /data/zookeeper/myid'
```
Copy To All Nodes
```
scp -r apache-zookeeper-3.6.3-bin test02:~
scp -r apache-zookeeper-3.6.3-bin test03:~
```
Start Zookeeper
```
ssh test01 'cd ~/apache-zookeeper-3.6.3-bin && bin/zkServer.sh start'
ssh test02 'cd ~/apache-zookeeper-3.6.3-bin && bin/zkServer.sh start'
ssh test03 'cd ~/apache-zookeeper-3.6.3-bin && bin/zkServer.sh start'
```

#Hbase

Download Hbase

wget https://downloads.apache.org/hbase/2.4.9/hbase-2.4.9-bin.tar.gz
tar zxvf hbase-2.4.9-bin.tar.gz

Edit conf/regionservers
```
test02
test03
```
Create a new file in conf/backup-masters
```
test03
```
Edit the hbase-site.xml configuration.
```
<property>
  <name>hbase.cluster.distributed</name>
  <value>true</value>
</property>
<property>
  <name>hbase.rootdir</name>
  <value>hdfs://cluster01/hbase</value>
</property>
<property>
  <name>hbase.zookeeper.quorum</name>
  <value>test01,test02,test03</value>
</property>
<property>
  <name>hbase.zookeeper.property.dataDir</name>
  <value>/usr/local/zookeeper</value>
</property>
```
Finally, remove the existing configuration for hbase.tmp.dir and hbase.unsafe.stream.capability.enforce.

Sync Time

remote_time=`ssh test01 date +'%H:%M:%S'` && date -s $remote_time

Start HBase.

bin/start-hbase.sh

Check the HBase directory in HDFS.

hdfs dfs -ls /hbase

Web UI http://test01:16010/master-status
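Once HBase is running, a quick smoke test from the HBase directory is to ask for the cluster status through the bundled shell; the region servers and backup master configured above should all be reported:

```
echo "status" | bin/hbase shell
```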
#Phoenix

Download and expand the latest phoenix-hbase-[hbase.version]-[phoenix.version]-bin.tar.gz for your HBase version.

wget https://dlcdn.apache.org/phoenix/phoenix-5.1.2/phoenix-hbase-2.4-5.1.2-bin.tar.gz
tar zxvf phoenix-hbase-2.4-5.1.2-bin.tar.gz
cd phoenix-hbase-2.4-5.1.2-bin

Add the phoenix-server-hbase-[hbase.version]-[phoenix.version].jar to the classpath of all HBase region servers and masters and remove any previous version. An easy way to do this is to copy it into the HBase lib directory.
```
scp phoenix-server-hbase-2.4-5.1.2.jar test01:~/hbase-2.4.9/lib/
scp phoenix-server-hbase-2.4-5.1.2.jar test02:~/hbase-2.4.9/lib/
scp phoenix-server-hbase-2.4-5.1.2.jar test03:~/hbase-2.4.9/lib/
```
Restart HBase.
```
bin/stop-hbase.sh
bin/start-hbase.sh
# or
bin/rolling-restart.sh
```
Add the phoenix-client-hbase-[hbase.version]-[phoenix.version].jar to the classpath of any JDBC client.

##Command Line

A terminal interface to execute SQL from the command line is now bundled with Phoenix. To start it, execute the following from the bin directory:

sqlline.py test01:2181

#Tez

download
```
https://archive.apache.org/dist/tez/0.9.2/apache-tez-0.9.2-bin.tar.gz
```
Copy the relevant tez tarball into HDFS
```
hadoop fs -mkdir -p /apps/tez-0.9.2
hadoop fs -copyFromLocal apache-tez-0.9.2-bin.tar.gz /apps/tez-0.9.2/
```
configure tez-site.xml
```
cd /home/hadoop/apache-tez-0.9.2-bin/conf
vi tez-site.xml

#add
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>tez.lib.uris</name>
    <value>${fs.defaultFS}/apps/tez-0.9.2/apache-tez-0.9.2-bin.tar.gz</value>
  </property>
</configuration>
```
Configure the client node to include the tez libraries in the hadoop classpath
```
vi .bashrc

export TEZ_HOME=/home/hadoop/apache-tez-0.9.2-bin
export TEZ_CONF_DIR=$TEZ_HOME/conf
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:${TEZ_CONF_DIR}:${TEZ_HOME}/*:${TEZ_HOME}/lib/*
```
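To verify the Tez setup you can run one of the bundled examples on YARN; this sketch assumes the tez-examples jar shipped inside apache-tez-0.9.2-bin and reuses the wordcount input created earlier:

```
hadoop jar $TEZ_HOME/tez-examples-0.9.2.jar orderedwordcount /test/wordcount /test/output/tez-wordcount
```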
#Mysql

Install Mysql
```
wget https://dev.mysql.com/get/mysql57-community-release-el7-9.noarch.rpm
sudo rpm -ivh mysql57-community-release-el7-9.noarch.rpm
sudo yum install mysql-server --nogpgcheck

sudo systemctl start mysqld
sudo systemctl status mysqld
```
During the installation process, a temporary password is generated for the MySQL root user. Locate it in mysqld.log with this command:

sudo grep 'temporary password' /var/log/mysqld.log

Configuring MySQL

MySQL includes a security script to change some of the less secure default options for things like remote root logins and sample users. Use this command to run the security script, and set the root password to @WSX1qaz (it is referenced by the configurations below).

sudo mysql_secure_installation

Mysql Cli

mysql -u root -p

Allow Remote Access
```
use mysql;
update user set host='%' where user='root';
flush privileges;
```

#Hive

Start by downloading the most recent stable release of Hive from one of the Apache download mirrors.
```
wget https://archive.apache.org/dist/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz
tar zxvf apache-hive-3.1.2-bin.tar.gz
```
Hive uses Hadoop, so you must have Hadoop in your path.

In addition, you must use the HDFS commands below to create /tmp and /user/hive/warehouse (aka hive.metastore.warehouse.dir) and set them chmod g+w before you can create a table in Hive.
```
hdfs dfs -mkdir /tmp
hdfs dfs -mkdir -p /user/hive/warehouse
hdfs dfs -chmod g+w /tmp
hdfs dfs -chmod g+w /user/hive/warehouse
```
Configure Hive (hive-site.xml)

cp hive-default.xml.template hive-site.xml

edit hive-site.xml
```
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://test01:3306/metastore?useSSL=false</value>
    <description>JDBC connect string for a JDBC metastore. To use SSL to encrypt/authenticate the connection, provide database-specific SSL flag in the connection URL. For example, jdbc:postgresql://myhost/db?ssl=true for postgres database.</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
    <description>Username to use against metastore database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>@WSX1qaz</value>
    <description>password to use against metastore database</description>
  </property>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/user/hive/warehouse</value>
    <description>default location for Hive tables.</description>
  </property>
  <property>
    <name>hive.metastore.port</name>
    <value>9083</value>
    <description>Hive metastore listener port</description>
  </property>
  <property>
    <name>hive.metastore.db.type</name>
    <value>mysql</value>
    <description>Expects one of [derby, oracle, mysql, mssql, postgres]. Type of database used by the metastore. Information schema & JDBCStorageHandler depend on it.</description>
  </property>
  <property>
    <name>hive.server2.authentication</name>
    <value>CUSTOM</value>
  </property>
  <property>
    <name>hive.server2.custom.authentication.class</name>
    <value>com.xxx.xxx.plugin.hiveauth.UserPasswordAuthProvider</value>
  </property>
  <property>
    <name>hive.server2.custom.authentication.filepath</name>
    <value>/home/hadoop/spark-3.1.2-bin-hadoop2.7/conf/users</value>
  </property>
</configuration>
```
We need to run the schematool command below as an initialization step; here mysql is used as the db type.
```
cp ~/mysql-connector-java-5.1.49.jar lib/
bin/schematool -dbType mysql -initSchema
```
HiveServer2 (introduced in Hive 0.11) has its own CLI called Beeline. HiveCLI is now deprecated in favor of Beeline, as it lacks the multi-user, security, and other capabilities of HiveServer2. To run HiveServer2 and Beeline from the shell:
```
#bin/hiveserver2
nohup bin/hiveserver2 &

bin/beeline -u jdbc:hive2://
bin/beeline -u jdbc:hive2://test01:10000 -n hadoop
#hive.server2.thrift.port
bin/beeline -u jdbc:hive2://$HS2_HOST:$HS2_PORT
```
web ui http://test01:10002/

Start Metastore Server
```
nohup bin/hive --service metastore &
```
Metastore Url `thrift://test01:9083`

#Kylin

Download Kylin
```
wget https://dlcdn.apache.org/kylin/apache-kylin-4.0.1/apache-kylin-4.0.1-bin-spark3.tar.gz
```
Kylin 4.0 uses MySQL as metadata storage; create the mysql database:

create database kylin_metadata

Make the following configuration in kylin.properties
```
kylin.metadata.url=kylin_metadata@jdbc,driverClassName=com.mysql.jdbc.Driver,url=jdbc:mysql://test01:3306/kylin_metadata,username=root,password=@WSX1qaz
kylin.env.zookeeper-connect-string=test01:2181

kylin.engine.spark-conf.spark.executor.memory=1G
kylin.engine.spark-conf.spark.driver.memory=1G
kylin.query.spark-conf.spark.driver.memory=1G
kylin.query.spark-conf.spark.executor.memory=1G
```
Put the mysql jdbc connector into $KYLIN_HOME/ext/; if there is no such directory, create it.
```
mkdir ext
cp ~/apache-hive-3.1.2-bin/lib/mysql-connector-java-5.1.49.jar ext/
cp ~/apache-hive-3.1.2-bin/lib/mysql-connector-java-5.1.49.jar spark-3.1.1-bin-hadoop2.7/jars
```
Kylin runs on a Hadoop cluster and has certain requirements for the version, access rights, and CLASSPATH of each component. To avoid various environmental problems, you can run the script $KYLIN_HOME/bin/check-env.sh to test your environment:

bin/check-env.sh

Start Kylin

bin/kylin.sh start

Web UI http://test01:7070/kylin

The initial username and password are ADMIN/KYLIN.

Kylin will generate files on HDFS. The default root directory is "kylin/", and the metadata table name of the Kylin cluster is used as the second-level directory name, "kylin_metadata" by default (this can be customized in conf/kylin.properties).

#Hue

Install Hue
```
wget https://codeload.github.com/cloudera/hue/zip/refs/tags/release-4.10.0 -O hue-release-4.10.0.zip
unzip hue-release-4.10.0.zip
cd hue-release-4.10.0

yum install python2-devel gcc krb5-devel libsq3-devel gcc-c++ libgsasl-devel openldap-devel openssl openssl-devel libxslt-devel rsync npm
yum install cyrus-sasl-plain cyrus-sasl-devel cyrus-sasl-gssapi cyrus-sasl-md5
yum install mysql-devel --nogpgcheck

PREFIX=/home/hadoop/hue-4.10.0 make install
cd /home/hadoop/hue-4.10.0/hue
```
Configure Hue
```
cp desktop/conf/pseudo-distributed.ini.tmpl desktop/conf/pseudo-distributed.ini
vi desktop/conf/pseudo-distributed.ini

http_host=0.0.0.0
http_port=8888

[[database]]
engine=mysql
host=test01
port=3306
user=root
password=@WSX1qaz
name=hue
```
And run the table creation one time:

./build/env/bin/hue migrate

After the installation, you can start Hue on your Hue Server by doing:
```
build/env/bin/hue runserver 0.0.0.0:8888
nohup build/env/bin/hue runserver 0.0.0.0:8888 &
```
This will start several subprocesses, corresponding to the different Hue components. Your Hue installation is now running.

Web UI http://test01:8888/

Create User `hadoop`

#Oozie

These instructions install and run Oozie using an embedded Jetty server and an embedded Derby database.
```
wget https://dlcdn.apache.org/oozie/5.2.1/oozie-5.2.1.tar.gz --no-check-certificate
```
The simplest way to build Oozie is to run the mkdistro.sh script:

yum install mvn
bin/mkdistro.sh -DskipTests -Dhadoop.version=2.7.5 -Dhive.version=3.1.2 -Dhbase.version=2.4.9 -Dtez.version=0.9.2 -Ptez -Puber

Oozie ignores any set value for OOZIE_HOME; Oozie computes its home automatically.

Configure the Hadoop cluster with a proxyuser for the Oozie process. The following two properties are required in the Hadoop core-site.xml:
```
<!-- OOZIE -->
<property>
  <name>hadoop.proxyuser.[OOZIE_SERVER_USER].hosts</name>
  <value>[OOZIE_SERVER_HOSTNAME]</value>
</property>
<property>
  <name>hadoop.proxyuser.[OOZIE_SERVER_USER].groups</name>
  <value>[USER_GROUPS_THAT_ALLOW_IMPERSONATION]</value>
</property>
```
Replace the capital letter sections with specific values and then restart Hadoop.

e.g.
```
<property>
  <name>hadoop.proxyuser.hadoop.hosts</name>
  <value>test01</value>
</property>
<property>
  <name>hadoop.proxyuser.hadoop.groups</name>
  <value>hadoop</value>
</property>
```
Note: not compatible with Hive 3.x and HBase 2.x.

#Azkaban

##Build From Source
```
wget https://github.com/azkaban/azkaban/archive/refs/tags/4.0.0.zip
unzip 4.0.0.zip
cd azkaban-4.0.0

sed -i 's#https://linkedin.bintray.com/maven#https://linkedin.jfrog.io/artifactory/open-source/#g' build.gradle
```
The following commands run on *nix platforms like Linux and OS X.
```
# Build and install distributions
./gradlew installDist

# Build Azkaban
./gradlew build

# Clean the build
./gradlew clean

# Run tests
./gradlew test

# Build without running tests
./gradlew build -x test
```
##Installing the Solo Server

Start the solo server:

cd azkaban-solo-server/build/install/azkaban-solo-server; bin/start-solo.sh

Web UI: http://localhost:8081/

The default login username and password for the solo server are both azkaban, which is configured in conf/azkaban-users.xml in the resources folder of the solo server.

Stop the server:

bin/shutdown-solo.sh

##Create Flow

Create a simple file called `flow20.project`
```
azkaban-flow-version: 2.0
```
Create another file called `basic.flow`
```
nodes:
  - name: jobA
    type: command
    config:
      command: echo "This is an echoed text."
```
Select the two files you've already created and right click to compress them into a zip file called `Archive.zip`
```
zip -r Archive.zip .
```
You can then upload Archive.zip to your project through the Web UI.

###Job Dependencies

Jobs can have dependencies on each other. You can use the dependsOn section to list all the parent jobs. In the example below, after jobA and jobB run successfully, jobC will start to run.
```
nodes:
  - name: jobC
    type: noop
    # jobC depends on jobA and jobB
    dependsOn:
      - jobA
      - jobB

  - name: jobA
    type: command
    config:
      command: echo "This is an echoed text."

  - name: jobB
    type: command
    config:
      command: pwd
```
You can zip the new basic.flow and flow20.project again and then upload it to Azkaban. Try to execute the flow and see the difference.

#Cassandra

Download the binary tarball from one of the mirrors on the Apache Cassandra Download site.

wget https://dlcdn.apache.org/cassandra/4.0.1/apache-cassandra-4.0.1-bin.tar.gz
tar zxvf apache-cassandra-4.0.1-bin.tar.gz

Configure Cassandra

Configuring Cassandra is done by setting yaml properties in the cassandra.yaml file.
```
cluster_name: 'Test Cluster'

seed_provider:
  - class_name: org.apache.cassandra.locator.SimpleSeedProvider
    parameters:
      - seeds: "test01:7000,test02:7000,test03:7000"

storage_port: 7000
listen_address: 192.168.0.161
#listen_interface: ens192
native_transport_port: 9042

data_file_directories:
  - /data/cassandra/data
commitlog_directory: /data/cassandra/commitlog
cdc_raw_directory: /data/cassandra/cdc_raw
saved_caches_directory: /data/cassandra/saved_caches
hints_directory: /data/cassandra/hints
```
vi cassandra-env.sh
```
MAX_HEAP_SIZE="1G"
```
Copy to All Nodes
```
scp -r apache-cassandra-4.0.1/ test02:~
scp -r apache-cassandra-4.0.1/ test03:~
```
Start Cassandra:
```
bin/cassandra 2>&1 > /dev/null

ssh test01 'cd apache-cassandra-4.0.1 && bin/cassandra'
ssh test02 'cd apache-cassandra-4.0.1 && bin/cassandra'
ssh test03 'cd apache-cassandra-4.0.1 && bin/cassandra'
```
Monitor the progress of the startup with:

tail -f logs/system.log

Check the status of Cassandra:

bin/nodetool status

The status column in the output should report UN, which stands for "Up/Normal".

Connect to the database with:

bin/cqlsh
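As a quick smoke test you can create a keyspace replicated across the three nodes and then list the keyspaces; the keyspace name here is only an example:

```
bin/cqlsh test01 -e "CREATE KEYSPACE IF NOT EXISTS demo WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3};"
bin/cqlsh test01 -e "DESCRIBE KEYSPACES;"
```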
#Kudu

##Build Kudu

RHEL or CentOS 7.0 or later is required to build Kudu from source. To build on a version older than 8.0, the Red Hat Developer Toolset must be installed.

Install the prerequisite libraries, if they are not installed.
```
sudo yum install autoconf automake cyrus-sasl-devel cyrus-sasl-gssapi \
  cyrus-sasl-plain flex gcc gcc-c++ gdb git java-1.8.0-openjdk-devel \
  krb5-server krb5-workstation libtool make openssl-devel patch \
  pkgconfig redhat-lsb-core rsync unzip vim-common which
```
If building on RHEL or CentOS older than 8.0, install the Red Hat Developer Toolset. Below are the steps required for CentOS.
```
sudo yum install centos-release-scl-rh
sudo yum install devtoolset-8
```
Optional: if support for Kudu's NVM (non-volatile memory) block cache is desired, install the memkind library.

sudo yum install memkind

Clone the Git repository and change to the new kudu directory.
```
git clone https://github.com/apache/kudu
cd kudu
```
Build any missing third-party requirements using the build-if-necessary.sh script.
```
build-support/enable_devtoolset.sh thirdparty/build-if-necessary.sh
```
Build Kudu, using the utilities installed in the previous step. Choose a build directory for the intermediate output, which can be anywhere in your filesystem except for the kudu directory itself. Notice that the devtoolset must still be specified, else you'll get `cc1plus: error: unrecognized command line option "-std=c++17"`.
```
mkdir -p build/release
cd build/release
../../build-support/enable_devtoolset.sh \
  ../../thirdparty/installed/common/bin/cmake \
  -DCMAKE_BUILD_TYPE=release ../..
make -j4
```
Optional: install Kudu executables, libraries and headers. The default installation directory is /usr/local. You can customize it through the DESTDIR environment variable.

sudo make DESTDIR=/home/hadoop/kudu-1.15.0 install
mv ~/kudu kudu-1.15.0/
chown -R hadoop:hadoop kudu-1.15.0/
export KUDU_HOME=/home/hadoop/kudu-1.15.0/kudu

##Configure And Start Kudu

To configure the behavior of each Kudu process, you can pass command-line flags when you start it, or read those options from configuration files by passing them using one or more --flagfile=<file> options. You can even include the --flagfile option within your configuration file to include other files. You can place options for masters and tablet servers into the same configuration file, and each will ignore options that do not apply.
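For example, the flags used in the start scripts below could equally be collected into a flag file, one flag per line, and passed with --flagfile; the file name and location here are only illustrative:

```
cat > /home/hadoop/kudu-master.gflagfile <<'EOF'
--fs_wal_dir=/data/kudumaster/wal
--fs_data_dirs=/data/kudumaster/data
--fs_metadata_dir=/data/kudumaster/metadata
--log_dir=/data/kudumaster/log
EOF

sbin/kudu-master --flagfile=/home/hadoop/kudu-master.gflagfile
```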
Create Data Directory
```
ssh test01 'mkdir -p /data/kudumaster/data; mkdir -p /data/kudumaster/metadata; mkdir -p /data/kudumaster/wal; mkdir -p /data/kudumaster/log; mkdir -p /data/kuduserver/data; mkdir -p /data/kuduserver/metadata; mkdir -p /data/kuduserver/wal; mkdir -p /data/kuduserver/log'
ssh test02 'mkdir -p /data/kudumaster/data; mkdir -p /data/kudumaster/metadata; mkdir -p /data/kudumaster/wal; mkdir -p /data/kudumaster/log; mkdir -p /data/kuduserver/data; mkdir -p /data/kuduserver/metadata; mkdir -p /data/kuduserver/wal; mkdir -p /data/kuduserver/log'
ssh test03 'mkdir -p /data/kudumaster/data; mkdir -p /data/kudumaster/metadata; mkdir -p /data/kudumaster/wal; mkdir -p /data/kudumaster/log; mkdir -p /data/kuduserver/data; mkdir -p /data/kuduserver/metadata; mkdir -p /data/kuduserver/wal; mkdir -p /data/kuduserver/log'
```
Configuring the Kudu Master

To see all available configuration options for the kudu-master executable, run it with the --help option: `kudu-master --help`

start_master.sh
```
cd /home/hadoop/kudu-1.15.0/usr/local

host=`hostname`
if [[ $host == 'test01' ]]; then
    nohup sbin/kudu-master --master_addresses test01 --fs_data_dirs /data/kudumaster/data --fs_metadata_dir /data/kudumaster/metadata --fs_wal_dir /data/kudumaster/wal --log_dir /data/kudumaster/log > /dev/null 2>&1 &
elif [[ $host == 'test02' ]]; then
    nohup sbin/kudu-master --master_addresses test01,test02 --fs_data_dirs /data/kudumaster/data --fs_metadata_dir /data/kudumaster/metadata --fs_wal_dir /data/kudumaster/wal --log_dir /data/kudumaster/log >/dev/null 2>&1 &
elif [[ $host == 'test03' ]]; then
    nohup sbin/kudu-master --master_addresses test01,test02,test03 --fs_data_dirs /data/kudumaster/data --fs_metadata_dir /data/kudumaster/metadata --fs_wal_dir /data/kudumaster/wal --log_dir /data/kudumaster/log >/dev/null 2>&1 &
fi
```
```
ssh test01 'cd /home/hadoop/kudu-1.15.0/usr/local && ./start_master.sh'
ssh test02 'cd /home/hadoop/kudu-1.15.0/usr/local && ./start_master.sh'
ssh test03 'cd /home/hadoop/kudu-1.15.0/usr/local && ./start_master.sh'
```
Configuring Tablet Servers

start_server.sh
```
cd /home/hadoop/kudu-1.15.0/usr/local
nohup sbin/kudu-tserver --tserver_master_addrs test01:7051,test02:7051,test03:7051 --fs_data_dirs /data/kuduserver/data --fs_metadata_dir /data/kuduserver/metadata --fs_wal_dir /data/kuduserver/wal --log_dir /data/kuduserver/log >/dev/null 2>&1 &
```
```
ssh test01 'cd /home/hadoop/kudu-1.15.0/usr/local && ./start_server.sh'
ssh test02 'cd /home/hadoop/kudu-1.15.0/usr/local && ./start_server.sh'
ssh test03 'cd /home/hadoop/kudu-1.15.0/usr/local && ./start_server.sh'
```

##Administering Kudu

Web UI http://test01:8051
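With the masters and tablet servers running, the overall health of the cluster can be checked with the kudu CLI from the same /home/hadoop/kudu-1.15.0/usr/local directory used above:

```
bin/kudu cluster ksck test01:7051,test02:7051,test03:7051
```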
#Trino

##Installing Trino

Download the Trino server tarball, trino-server-369.tar.gz, and unpack it.

wget https://repo1.maven.org/maven2/io/trino/trino-server/369/trino-server-369.tar.gz
tar zxvf trino-server-369.tar.gz
cd trino-server-369

##Configuring Trino

Create an etc directory inside the installation directory. This holds the following configuration:
```
mkdir etc
cd etc
```
Node properties `node.properties`
```
node.environment=production
node.data-dir=/data/presto
node.id={nodeid}
```
JVM config `jvm.config`
```
-server
-Xmx1500M
-XX:-UseBiasedLocking
-XX:+UseG1GC
-XX:G1HeapRegionSize=32M
-XX:+ExplicitGCInvokesConcurrent
-XX:+ExitOnOutOfMemoryError
-XX:+HeapDumpOnOutOfMemoryError
-XX:-OmitStackTraceInFastThrow
-XX:ReservedCodeCacheSize=216M
-XX:PerMethodRecompilationCutoff=10000
-XX:PerBytecodeRecompilationCutoff=10000
-Djdk.attach.allowAttachSelf=true
-Djdk.nio.maxCachedBufferSize=2000000
```
Config properties for the coordinator `coordinator-config.properties`
```
coordinator=true
node-scheduler.include-coordinator=false
http-server.http.port=8880
query.max-memory=5GB
query.max-memory-per-node=1GB
discovery.uri=http://test01:8880
```
Config properties for workers `worker-config.properties`
```
coordinator=false
http-server.http.port=8880
query.max-memory=5GB
query.max-memory-per-node=1GB
discovery.uri=http://test01:8880
```
Log levels `log.properties`
```
io.trino=INFO
```
Catalog properties
```
mkdir catalog
cd catalog
```
kudu connector
```
vi kudu.properties

connector.name=kudu
kudu.client.master-addresses=test01:7051,test02:7051,test03:7051
kudu.schema-emulation.enabled=false
kudu.schema-emulation.prefix=
kudu.client.default-admin-operation-timeout = 60s
kudu.client.default-operation-timeout = 300s
kudu.client.default-socket-read-timeout = 60s
kudu.client.disable-statistics = false
```
hive connector
```
vi hive.properties

connector.name=hive
hive.metastore.uri=thrift://test01:9083
```
phoenix connector
```
vi phoenix.properties

connector.name=phoenix5
phoenix.connection-url=jdbc:phoenix:test01,test02,test03:2181:/hbase
```
clickhouse connector
```
vi clickhouse.properties

connector.name=clickhouse
connection-url=jdbc:clickhouse://test01:8123/
connection-user=default
connection-password=
```
##Running Trino

Install Jdk11

yum install java-11-openjdk
alternatives --config java
java -version

The installation directory contains the launcher script in bin/launcher. Trino can be started as a daemon by running the following:
```
ssh test01 'cd /home/hadoop/trino-server-369 && bin/launcher --config=etc/coordinator-config.properties start'
ssh test02 'cd /home/hadoop/trino-server-369 && bin/launcher --config=etc/worker-config.properties start'
ssh test03 'cd /home/hadoop/trino-server-369 && bin/launcher --config=etc/worker-config.properties start'
```
restart
```
ssh test01 'cd /home/hadoop/trino-server-369 && bin/launcher --config=etc/coordinator-config.properties restart'
ssh test02 'cd /home/hadoop/trino-server-369 && bin/launcher --config=etc/worker-config.properties restart'
ssh test03 'cd /home/hadoop/trino-server-369 && bin/launcher --config=etc/worker-config.properties restart'
```
Alternatively, it can be run in the foreground, with the logs and other output written to stdout/stderr.

bin/launcher run

Web UI http://test01:8880
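To actually run queries you can fetch the matching CLI (version 369 to match the server) and point it at the coordinator; the catalog and schema below are just examples taken from the connectors configured above:

```
wget https://repo1.maven.org/maven2/io/trino/trino-cli/369/trino-cli-369-executable.jar -O trino
chmod +x trino
./trino --server http://test01:8880 --catalog hive --schema default
```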
#Impala

Download the latest release

wget https://dlcdn.apache.org/impala/4.0.0/apache-impala-4.0.0.tar.gz --no-check-certificate

##Impala Components

Impala primarily consists of these executables, which should be available after you build from source:

- impalad - The Impala daemon. Plans and executes queries against HDFS, HBase, and Amazon S3 data. Run one impalad process on each node in the cluster that has a DataNode.
- statestored - Name service that tracks the location and status of all impalad instances in the cluster. Run one instance of this daemon on a node in your cluster. Most production deployments run this daemon on the namenode.
- catalogd - Metadata coordination service that broadcasts changes from Impala DDL and DML statements to all affected Impala nodes, so that new tables, newly loaded data, and so on are immediately visible to queries submitted through any Impala node. (Prior to Impala 1.2, you had to run the REFRESH or INVALIDATE METADATA statement on each node to synchronize changed metadata. Now those statements are only required if you perform the DDL or DML through an external mechanism such as Hive or by uploading data to the Amazon S3 filesystem.) Run one instance of this daemon on a node in your cluster, preferably on the same host as the statestored daemon.
- impala-shell - Command-line interface for issuing queries to the Impala daemon. You install this on one or more hosts anywhere on your network, not necessarily DataNodes or even within the same cluster as Impala. It can connect remotely to any instance of the Impala daemon.

##Build Impala
```
tar zxvf apache-impala-4.0.0.tar.gz
cd apache-impala-4.0.0
#see README-build.md
```
Impala can be built with pre-built components or components downloaded from S3. The components needed to build Impala are Apache Hadoop, Hive, and HBase. If you need to manually override the locations or versions of these components, you can do so through the environment variables and scripts listed below.
```
export IMPALA_HOME=`pwd`
./bin/bootstrap_system.sh
source ./bin/impala-config.sh

export HADOOP_HOME=/home/hadoop/hadoop-2.7.5
export HIVE_HOME=/home/hadoop/apache-hive-3.1.2-bin
export HBASE_HOME=/home/hadoop/hbase-2.4.9
export HADOOP_INCLUDE_DIR=${HADOOP_HOME}/include
export HADOOP_LIB_DIR=${HADOOP_HOME}/lib
```

#Clickhouse

First, you need to add the official repository:
```
sudo yum install yum-utils
sudo rpm --import https://repo.clickhouse.com/CLICKHOUSE-KEY.GPG
sudo yum-config-manager --add-repo https://repo.clickhouse.com/rpm/stable/x86_64
```
Then run these commands to install packages:

sudo yum install clickhouse-server clickhouse-client

Configure ClickHouse `/etc/clickhouse-server/config.xml`
```
<listen_host>0.0.0.0</listen_host>
<http_port>8123</http_port>
<tcp_port>19000</tcp_port>
<mysql_port>19004</mysql_port>
<postgresql_port>19005</postgresql_port>
<interserver_http_port>19009</interserver_http_port>

<user_directories>
    <local_directory>
        <path>/data/clickhouse/access/</path>
    </local_directory>
</user_directories>

<remote_servers>
    <perftest_3shards_1replicas>
        <shard>
            <replica>
                <host>test01</host>
                <port>19000</port>
            </replica>
        </shard>
        <shard>
            <replica>
                <host>test02</host>
                <port>19000</port>
            </replica>
        </shard>
        <shard>
            <replica>
                <host>test03</host>
                <port>19000</port>
            </replica>
        </shard>
    </perftest_3shards_1replicas>
</remote_servers>

<path>/data/clickhouse/</path>
<tmp_path>/data/clickhouse/tmp/</tmp_path>
<user_files_path>/data/clickhouse/user_files/</user_files_path>

<zookeeper>
    <node>
        <host>test01</host>
        <port>2181</port>
    </node>
    <node>
        <host>test02</host>
        <port>2181</port>
    </node>
    <node>
        <host>test03</host>
        <port>2181</port>
    </node>
</zookeeper>
```
Make the data directory and copy the configuration to all nodes:
```
mkdir -p /data/clickhouse/
chown -R clickhouse:clickhouse /data/clickhouse/

scp /etc/clickhouse-server/config.xml test02:/etc/clickhouse-server/config.xml
scp /etc/clickhouse-server/config.xml test03:/etc/clickhouse-server/config.xml
```
To start the server as a daemon, run:

sudo clickhouse start
There are also other ways to run ClickHouse:
```
sudo service clickhouse-server start
sudo systemctl start clickhouse-server.service
sudo /etc/init.d/clickhouse-server start
sudo systemctl restart clickhouse-server.service
```
See the logs in the /var/log/clickhouse-server/ directory.

After launching the server, you can use the command-line client to connect to it:

clickhouse-client

By default, it connects to localhost:9000 on behalf of the user default without a password. It can also be used to connect to a remote server using the --host argument.
```
clickhouse-client --port=19000 --host=test03

#sql
show clusters;
```
Create a new database:

CREATE DATABASE testdb ON CLUSTER 'perftest_3shards_1replicas';
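On top of that database, a sharded table can be created across the cluster by pairing a local MergeTree table with a Distributed table; the table names below are only illustrative:

```
clickhouse-client --host=test01 --port=19000 --query "CREATE TABLE testdb.events_local ON CLUSTER 'perftest_3shards_1replicas' (id UInt64, msg String) ENGINE = MergeTree ORDER BY id"
clickhouse-client --host=test01 --port=19000 --query "CREATE TABLE testdb.events ON CLUSTER 'perftest_3shards_1replicas' AS testdb.events_local ENGINE = Distributed(perftest_3shards_1replicas, testdb, events_local, rand())"
```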