GPS Log Analysis Case Study
Project Architecture
Preparation
Data Source (Geolife Trajectories 1.3)
Simulating data with Python
Randomly read trajectories from the data source and POST them to Nginx.
import json
import os
import random
from itertools import islice
import requests
url = 'http://node3:8787/gps_logs'
# generate a random IP address
def sample_ip():
ip_slice_list = [10, 29, 30, 46, 55, 63, 72, 87, 98, 132, 156, 124, 167, 143, 187, 168, 190, 201, 202, 214,
215, 222]
    slice_ip = random.sample(ip_slice_list, 4)  # randomly pick 4 elements from ip_slice_list as the octets
    return ".".join([str(item) for item in slice_ip])  # join the four octets into a dotted IP string
def sample_type():
# UAV (drone) models
uav_type = [
'MAVIC MINI',
'PHANTOM 4',
'INSPIRE',
'INSPIRE1 悟 PRO',
'PHANTOM RTK',
'M600',
]
return random.sample(uav_type, 1)
# randomly pick one trajectory log file
dist_upon = random.randint(0, 181)
if 0 <= dist_upon < 10:
dist_upon = '00' + str(dist_upon)
elif 10 <= dist_upon < 100:
dist_upon = '0' + str(dist_upon)
else:
dist_upon = str(dist_upon)
path = 'Geolife Trajectories 1.3/Data/' + dist_upon + '/Trajectory/'
name = ''
for _, _, files in os.walk(path):
name = files[random.randint(0, len(files) - 1)]
path += name
print(path)
uav_type = sample_type()[0]
with open(path, 'r') as f:
for line in islice(f, 6, None):
data = {
"id": name.split('.')[0],
"type": uav_type,
"data": line.strip()
}
# send the HTTP request
res = requests.post(url=url, data=json.dumps(data))
print(res.text)
Nginx
Nginx installation
For the Lua module and related components, see https://www.cnblogs.com/52fhy/p/10164553.html
./configure --user=hdfs --group=hdfs --prefix=/home/hdfs/env/nginx --with-http_stub_status_module --with-http_ssl_module --with-http_v2_module --with-http_gzip_static_module --with-http_sub_module --add-module=/home/hdfs/env/lua/ngx_devel_kit-0.3.0 --add-module=/home/hdfs/env/lua/lua-nginx-module-0.10.13
make && make install
Nginx configuration file
http
log_format gps_logs_format escape=json '{"remote":$remote_addr,"msec":$msec,"http_host":$http_host,"request_uri":$request_uri,"request":$request_body}';
server
lua_need_request_body on;
location = /gps_logs {
content_by_lua_block {
ngx.req.read_body()
local data = ngx.req.get_body_data()
ngx.say(data)
}
access_log /home/hdfs/env/nginx/logs/gps_logs/access.log gps_logs_format;
}
Log rotation
vim gps_log_slice.sh
#!/bin/bash
# directory where the log files are stored
LOG_HOME="/home/hdfs/env/nginx/logs/gps_logs/"
# backup file name
LOG_PATH_BAK="$(date -d yesterday +%Y%m%d%H%M)".access.log
# rename the current log file
mv ${LOG_HOME}/access.log ${LOG_HOME}/back/${LOG_PATH_BAK}
# signal the Nginx master process to reopen its log files
kill -USR1 `cat /home/hdfs/env/nginx/logs/nginx.pid`
Set up the cron job
1 0 * * * sh /home/hdfs/env/nginx/logs/gps_logs/gps_log_slice.sh
Automatically delete log files older than 7 days
touch auto-del-7-day-ago-log.sh
chmod +x auto-del-7-day-ago-log.sh
vim auto-del-7-day-ago-log.sh
find /home/hdfs/env/nginx/logs/gps_logs/back -mtime +7 -name "*access.log" -exec rm -rf {} \;
crontab -e
10 0 * * * /home/hdfs/env/nginx/logs/gps_logs/auto-del-7-day-ago-log.sh > /home/hdfs/env/nginx/logs/gps_logs/auto-del.out 2>&1
Flume
HBase
Create the table
create 'uav_gps_logs', 'uav_info'
Kafka
Create the topic
kafka-topics.sh --zookeeper node2:2181,node3:2181,node4:2181 --create --topic uavgps --partitions 3 --replication-factor 3
Consume the topic (for verification)
kafka-console-consumer.sh --bootstrap-server node1:9092,node2:9092,node3:9092 --topic uavgps
Log -> Flume -> HBase & Kafka
gps-flume-hbase.conf
# Read messages from the log file in real time and write them to HBase without transformation
agent.sources = logfile-source
agent.channels = file-channel
agent.sinks = hbase-sink
# logfile-source configuration
agent.sources.logfile-source.type = exec
# the source is the Nginx access log written by the /gps_logs location
agent.sources.logfile-source.command = tail -f /home/hdfs/env/nginx/logs/gps_logs/access.log
agent.sources.logfile-source.checkperiodic = 10
# channel definition: memory or file; memory is faster but loses data on failure, file is slower but durable
#agent.channels.file-channel.type = memory
# channel capacity
#agent.channels.file-channel.capacity = 1000
# number of events the sink pulls from the channel per transaction
#agent.channels.file-channel.transactionCapacity = 100
agent.channels.file-channel.type = file
agent.channels.file-channel.checkpointDir = /home/hdfs/tmp/flume-hbase-gps/checkpoint
agent.channels.file-channel.dataDirs = /home/hdfs/tmp/flume-hbase-gps/data
# sink configured as HBase
agent.sinks.hbase-sink.type = org.apache.flume.sink.hbase.HBaseSink
# HBase table name and column family
agent.sinks.hbase-sink.table = uav_gps_logs
agent.sinks.hbase-sink.columnFamily = uav_info
agent.sinks.hbase-sink.serializer = org.apache.flume.sink.hbase.RegexHbaseEventSerializer
#agent.sinks.hbase-sink.serializer.regex = (.*?)\\\^A(.*?)\\\^A(.*?)\\\^A(.*?)\\\^A(.*?)\\\^A(.*?)\\\^A(.*?)\\\^A(.*?)\\\^A(.*?)
agent.sinks.hbase-sink.serializer.regex = ^(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)$
# column names within the column family; note: the row key is generated automatically
agent.sinks.hbase-sink.serializer.colNames=remote_addr,msec,uav_id,line_id,uav_type,uav_status,latitude,longitude,altitude,uav_time
# wire the source, sink and channel together
agent.sources.logfile-source.channels = file-channel
agent.sinks.hbase-sink.channel = file-channel
gps-flume-kafka.conf
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -f /home/hdfs/env/nginx/logs/gps_logs/access.log
a1.sources.r1.checkperiodic = 10
# Describe the sink
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = uavgps
a1.sinks.k1.brokerList = node2:9092,node3:9092,node4:9092
a1.sinks.k1.requiredAcks = 1
a1.sinks.k1.batchSize = 20
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start in the background
- Flume -> HBase
nohup flume-ng agent -c conf -f gps-flume-hbase.conf -n agent -Dflume.root.logger=INFO,console 1>out-start-gps-hbase.out 2>&1 &
- Flume -> Kafka
nohup flume-ng agent -c conf -f gps-flume-kafka.conf -n a1 -Dflume.root.logger=INFO,console 1>out-start-gps-kafka.out 2>&1 &
HBase -> Hive
CREATE EXTERNAL TABLE IF NOT EXISTS gps.uav_gps
(key STRING, uav_id STRING, line_id STRING, uav_type STRING, uav_status TINYINT, latitude DOUBLE, longitude DOUBLE, altitude DOUBLE, uav_time STRING)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping"=":key,uav_info:uav_id,uav_info:line_id,uav_info:uav_type,uav_info:uav_status,uav_info:latitude,uav_info:longitude,uav_info:altitude,uav_info:uav_time")
TBLPROPERTIES ("hbase.table.name"="default:uav_gps_logs");
Fetching GPS data in real time and computing speed
SparkStreaming + Kafka
package cn.yangxin.spark.stream.kafka
import java.text.SimpleDateFormat
import java.util
import java.util.{Calendar, Date}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Durations, Seconds, StreamingContext}
import org.apache.spark.{SparkConf, TaskContext}
import scala.collection.mutable
object UAVGPSRealTimeAnalyze {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("test").setMaster("local")
conf.set("es.nodes", "node2");
conf.set("es.index.auto.create", "true");
conf.set("es.mapping.id", "id");
conf.set("es.port", "9200");
// build the Spark Streaming context
val ssc = new StreamingContext(conf, Durations.seconds(5))
ssc.sparkContext.setLogLevel("ERROR")
// checkpoint directory to retain computation state
ssc.checkpoint("./MyCheckpoint")
val db: Int = 2;
val topic: String = "uavgps"
/**
* Fetch consumer offsets from Redis
*/
val currentTopicOffset: mutable.Map[String, String] = getOffSetFromRedis(db, topic)
// initial topic offsets read from Redis:
currentTopicOffset.foreach(tp => { println(s"initial offset read: $tp") })
// convert into the type required by ConsumerStrategies.Assign
val fromOffsets: Predef.Map[TopicPartition, Long] = currentTopicOffset.map { resultSet =>
new TopicPartition(topic, resultSet._1.toInt) -> resultSet._2.toLong
}.toMap
val kafkaParams: Map[String, Object] = Map[String, Object](
"bootstrap.servers" -> "node1:9092,node2:9092,node3:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "MyGroupId-uavgps",
"auto.offset.reset" -> "earliest",
"enable.auto.commit" -> "false"
)
// create the direct stream (5-second batches)
val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
PreferConsistent,
ConsumerStrategies.Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets)
)
/**
* Keep only GPS records whose status field is 2 (in flight); drop the rest
*/
val gpsLogFilter = stream.filter(line => {
val f = line.value().matches("^(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)$")
f && line.value().split(",")(4).toInt == 2
})
/**
* Extract id, latitude, longitude and time
*/
val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val point: DStream[(String, (Double, Double, Date, String))] = gpsLogFilter.map(line => {
val row: String = line.value()
val split: Array[String] = row.split(",")
(split(2), (split(5).toDouble, split(6).toDouble, format.parse(split(8)), "0.0"))
})
/**
* Compute the great-circle distance (in meters) between two latitude/longitude points
*/
def getDistance(lon1: Double, lat1: Double, lon2: Double, lat2: Double): Double = {
// pi is π, r is the Earth's radius
val pi = Math.PI
val r: Double = 6378137 // equatorial radius in meters
// a1, a2, b1, b2 are the latitudes and longitudes above converted to radians
val a1 = lat1 * pi / 180.0
val a2 = lon1 * pi / 180.0
val b1 = lat2 * pi / 180.0
val b2 = lon2 * pi / 180.0
val t1: Double = Math.cos(a1) * Math.cos(a2) * Math.cos(b1) * Math.cos(b2)
val t2: Double = Math.cos(a1) * Math.sin(a2) * Math.cos(b1) * Math.sin(b2)
val t3: Double = Math.sin(a1) * Math.sin(b1)
val distance = Math.acos(t1 + t2 + t3) * r
distance
}
/**
* Every 5 seconds, over a 10-second window, compute the speed between adjacent points for each UAV
*/
point.reduceByKeyAndWindow(
(t1: (Double, Double, Date, String), t2: (Double, Double, Date, String)) => {
val distance_ = getDistance(t1._2, t1._1, t2._2, t2._1)
val time_ = Math.abs(t2._3.getTime - t1._3.getTime) / 1000
val speed_ = (distance_ / time_).formatted("%.3f")
println(distance_, time_, speed_)
(t2._1, t2._2, t2._3, speed_)
}, Seconds(10), Seconds(5)).print()
// persist the consumed offsets to Redis
stream.foreachRDD(rdd => {
val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
// foreachPartition runs once per partition; log the offset range handled by each partition
rdd.foreachPartition { iter =>
val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
println(s"topic:${o.topic} partition:${o.partition} fromOffset:${o.fromOffset} untilOffset: ${o.untilOffset}")
}
saveOffsetToRedis(db, offsetRanges)
})
ssc.start()
ssc.awaitTermination()
ssc.stop()
}
def getOffSetFromRedis(db: Int, topic: String) = {
val jedis = RedisClient.pool.getResource
jedis.select(db)
val result: util.Map[String, String] = jedis.hgetAll(topic)
RedisClient.pool.returnResource(jedis)
if (result.size() == 0) {
result.put("0", "0")
result.put("1", "0")
result.put("2", "0")
}
import scala.collection.JavaConversions.mapAsScalaMap
val offsetMap: scala.collection.mutable.Map[String, String] = result
offsetMap
}
/**
* Save consumer offsets to Redis
*
*/
def saveOffsetToRedis(db: Int, offsetRanges: Array[OffsetRange]) = {
val jedis = RedisClient.pool.getResource
jedis.select(db)
offsetRanges.foreach(one => {
jedis.hset(one.topic, one.partition.toString, one.untilOffset.toString)
})
println("offsets saved to Redis")
RedisClient.pool.returnResource(jedis)
}
}
Elasticsearch
Preventing split-brain
Master node
node.master: true
node.data: false
Data node
node.master: false
node.data: true
All nodes
discovery.zen.ping.multicast.enabled: false
discovery.zen.ping.unicast.hosts: ["master", "slave1", "slave2"]
discovery.zen.minimum_master_nodes: 2
# enable inline scripting (needed for partial updates)
script.engine.groovy.inline.update: true
elasticsearch.yml
cluster.name: es-cluster # cluster name; nodes join the cluster whose name matches
node.name: es-node1 # node name, must be unique within the cluster
node.master: true # whether this node is eligible to become master
node.data: false
network.host: 0.0.0.0 # bind address; 0.0.0.0 allows external access
http.port: 9200 # must be unique when running multiple ES instances on one server, otherwise keep the default
transport.tcp.port: 9300 # must be unique when running multiple ES instances on one server, otherwise keep the default
discovery.zen.ping.unicast.hosts: ["node1","node2","node3","node4"] # transport addresses of all cluster nodes (the TCP port above)
discovery.zen.minimum_master_nodes: 2 # prevents split-brain; set to (master-eligible nodes / 2) + 1
http.cors.enabled: true # enable CORS so external tools and plugins can access the API
http.cors.allow-origin: "*" # allowed CORS origins
script.engine.groovy.inline.update: true
jvm.options
-Xms256m
-Xmx256m
/etc/sysctl.conf
vm.max_map_count=262144
sysctl -p # reload kernel parameters from /etc/sysctl.conf (or another specified file)
sysctl -a | grep vm.max_map_count # check the current value
sysctl -w vm.max_map_count=262144
ik and pinyin analyzers
Kibana
- Edit the configuration file
config/kibana.yml
server.port: 5601 ## service port
server.host: "0.0.0.0" ## bind address; 0.0.0.0 allows external access
elasticsearch.url: "http://node1:9200" ## Elasticsearch service address
Writing the DStream to Elasticsearch
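A minimal sketch of the ES write, assuming the reduceByKeyAndWindow result above is kept in a val speedStream: DStream[(String, (Double, Double, Date, String))] instead of only being printed, and that a matching elasticsearch-spark (elasticsearch-hadoop) connector jar is on the classpath. The connector picks up the es.nodes / es.port / es.index.auto.create / es.mapping.id settings already placed on the SparkConf; the helper name writeSpeedToEs and the index name uav_gps_speed/gps are illustrative assumptions.
import java.util.Date
import org.apache.spark.streaming.dstream.DStream
import org.elasticsearch.spark.streaming.EsSparkStreaming

// Hypothetical helper: index the windowed (id -> position/speed) stream into ES.
def writeSpeedToEs(speedStream: DStream[(String, (Double, Double, Date, String))]): Unit = {
  val docs = speedStream.map { case (id, (lat, lon, time, speed)) =>
    Map(
      "id"        -> id,            // es.mapping.id = "id": used as the document _id
      "latitude"  -> lat,
      "longitude" -> lon,
      "uav_time"  -> time.getTime,  // epoch millis
      "speed"     -> speed.toDouble // m/s, parsed back from the "%.3f" string
    )
  }
  // Index/type name is an assumption; adjust it to your own mapping.
  EsSparkStreaming.saveToEs(docs, "uav_gps_speed/gps")
}
Register the helper before ssc.start() (for example inside UAVGPSRealTimeAnalyze, right after the reduceByKeyAndWindow call). Because es.mapping.id points at the id field, each UAV keeps a single document that is overwritten with its latest position and speed on every window.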
Submitting the job
Write the submit script start-streaming.sh
jarlib="/home/hdfs/uav/lib"
CLASS_PATH=""
for i in `ls $jarlib/*.jar`; do
CLASS_PATH="$CLASS_PATH,$i";
done
CLASS_PATH=${CLASS_PATH#,}  # strip the leading comma
APP_MAINCLASS="cn.yangxin.spark.stream.kafka.UAVGPSRealTimeAnalyze"
APP_JAR="/home/hdfs/uav/uavstream.jar"
JAVA_CMD="spark-submit --master spark://node1:7077 --class $APP_MAINCLASS --jars $CLASS_PATH --deploy-mode cluster $APP_JAR"
echo $JAVA_CMD
eval $JAVA_CMD
Elasticsearch integration reference: https://www.jianshu.com/p/0e73958bfd70
Azkaban Installation
Download the release version
Install the Gradle wrapper (gradlew)
Compile
./gradlew clean build
Copy the following build artifacts to node4:
- azkaban-4.0.0/azkaban-exec-server/build/distributions/azkaban-exec-server-0.1.0-SNAPSHOT.tar.gz
- azkaban-4.0.0/azkaban-db/build/distributions/azkaban-db-0.1.0-SNAPSHOT.tar.gz
- azkaban-4.0.0/azkaban-web-server/build/distributions/azkaban-web-server-0.1.0-SNAPSHOT.tar.gz
Create the SSL keystore
keytool -keystore keystore -alias jetty -genkey -keyalg RSA
azkaban-web/conf/azkaban.properties
# Azkaban Personalization Settings
azkaban.name=My Azkaban
azkaban.label=My Local Azkaban
azkaban.color=#FF3601
azkaban.default.servlet.path=/index
web.resource.dir=/home/hdfs/env/azkaban/azkaban-web/web/
default.timezone.id=Asia/Shanghai
# Azkaban UserManager class
user.manager.class=azkaban.user.XmlUserManager
user.manager.xml.file=/home/hdfs/env/azkaban/azkaban-web/conf/azkaban-users.xml
# Loader for projects
executor.global.properties=/home/hdfs/env/azkaban/azkaban-web/conf/global.properties
azkaban.project.dir=projects
# Velocity dev mode
velocity.dev.mode=false
# Azkaban Jetty server properties.
jetty.use.ssl=false
jetty.maxThreads=25
jetty.port=8081
jetty.keystore=/home/hdfs/env/azkaban/keystore # keystore file
jetty.password=azkaban # keystore password
jetty.keypassword=azkaban # key password, same as the keystore password
jetty.truststore=/home/hdfs/env/azkaban/keystore # truststore file
jetty.trustpassword=azkaban # truststore password
# Azkaban Executor settings
# mail settings
mail.sender=
mail.host=
# User facing web server configurations used to construct the user facing server URLs. They are useful when there is a reverse proxy between Azkaban web servers and users.
# enduser -> myazkabanhost:443 -> proxy -> localhost:8081
# when this parameters set then these parameters are used to generate email links.
# if these parameters are not set then jetty.hostname, and jetty.port(if ssl configured jetty.ssl.port) are used.
# azkaban.webserver.external_hostname=myazkabanhost.com
# azkaban.webserver.external_ssl_port=443
# azkaban.webserver.external_port=8081
job.failure.email=
job.success.email=
lockdown.create.projects=false
cache.directory=cache
# JMX stats
jetty.connector.stats=true
executor.connector.stats=true
executor.port=12321
# Azkaban mysql settings by default. Users should configure their own username and password.
database.type=mysql
mysql.port=3306
mysql.host=node1
mysql.database=azkaban
mysql.user=root
mysql.password=123456
mysql.numconnections=100
#Multiple Executor
azkaban.use.multiple.executors=true
azkaban.executorselector.filters=StaticRemainingFlowSize,CpuStatus
azkaban.executorselector.comparator.NumberOfAssignedFlowComparator=1
azkaban.executorselector.comparator.Memory=1
azkaban.executorselector.comparator.LastDispatched=1
azkaban.executorselector.comparator.CpuUsage=1
azkaban-exec/conf/azkaban.properties
# Azkaban Personalization Settings
azkaban.name=My Azkaban
azkaban.label=My Local Azkaban
azkaban.color=#FF3601
azkaban.default.servlet.path=/index
web.resource.dir=/home/hdfs/env/azkaban/azkaban-exec/web/
default.timezone.id=Asia/Shanghai
# Azkaban UserManager class
user.manager.class=azkaban.user.XmlUserManager
user.manager.xml.file=/home/hdfs/env/azkaban/azkaban-exec/conf/azkaban-users.xml
# Loader for projects
executor.global.properties=/home/hdfs/env/azkaban/azkaban-exec/conf/global.properties
azkaban.project.dir=projects
# Velocity dev mode
velocity.dev.mode=false
# Azkaban Jetty server properties.
jetty.use.ssl=false
jetty.maxThreads=25
jetty.port=8081
# Where the Azkaban web server is located
azkaban.webserver.url=http://localhost:8081
# mail settings
mail.sender=
mail.host=
# User facing web server configurations used to construct the user facing server URLs. They are useful when there is a reverse proxy between Azkaban web servers and users.
# enduser -> myazkabanhost:443 -> proxy -> localhost:8081
# when this parameters set then these parameters are used to generate email links.
# if these parameters are not set then jetty.hostname, and jetty.port(if ssl configured jetty.ssl.port) are used.
# azkaban.webserver.external_hostname=myazkabanhost.com
# azkaban.webserver.external_ssl_port=443
# azkaban.webserver.external_port=8081
job.failure.email=
job.success.email=
lockdown.create.projects=false
cache.directory=cache
# JMX stats
jetty.connector.stats=true
executor.connector.stats=true
# Azkaban plugin settings
azkaban.jobtype.plugin.dir=/home/hdfs/env/azkaban/azkaban-exec/plugins/jobtypes
# Azkaban mysql settings by default. Users should configure their own username and password.
database.type=mysql
mysql.port=3306
mysql.host=node1
mysql.database=azkaban
mysql.user=root
mysql.password=123456
mysql.numconnections=100
# Azkaban Executor settings
executor.maxThreads=50
executor.flow.threads=30
executor.props.resolve.overrideExisting.enabled=false
executor.port=12321
Startup
- Start the executor: ./start-exec.sh
- Activate the executor in a browser: http://node4:12321/executor?action=activate
- Start the web server: ./start-web.sh
- Open the web UI in a browser: http://node4:8081/index