GPS Log Analysis Case Study
Project Architecture
Preparation
Data Source (Geolife Trajectories 1.3)
Simulating data with Python
Randomly read trajectories from the data source and POST them to Nginx.
import json
import os
import random
from itertools import islice
import requests
url = 'http://node3:8787/gps_logs'
# generate a random IP address
def sample_ip():
ip_slice_list = [10, 29, 30, 46, 55, 63, 72, 87, 98, 132, 156, 124, 167, 143, 187, 168, 190, 201, 202, 214,
215, 222]
    slice_ip = random.sample(ip_slice_list, 4)  # randomly pick 4 elements from ip_slice_list as the octets
    return ".".join([str(item) for item in slice_ip])  # join the four octets into a dotted IP string
def sample_type():
# UAV (drone) models
uav_type = [
'MAVIC MINI',
'PHANTOM 4',
'INSPIRE',
'INSPIRE1 悟 PRO',
'PHANTOM RTK',
'M600',
]
return random.sample(uav_type, 1)
# randomly pick one trajectory log file
dist_upon = random.randint(0, 181)
if 0 <= dist_upon < 10:
dist_upon = '00' + str(dist_upon)
elif 10 <= dist_upon < 100:
dist_upon = '0' + str(dist_upon)
else:
dist_upon = str(dist_upon)
path = 'Geolife Trajectories 1.3/Data/' + dist_upon + '/Trajectory/'
name = ''
for _, _, files in os.walk(path):
name = files[random.randint(0, len(files) - 1)]
path += name
print(path)
uav_type = sample_type()[0]
with open(path, 'r') as f:
for line in islice(f, 6, None):
data = {
"id": name.split('.')[0],
"type": uav_type,
"data": line.strip()
}
# send the HTTP request
res = requests.post(url=url, data=json.dumps(data))
print(res.text)
Nginx
Nginx installation
For the Lua module and related components, see https://www.cnblogs.com/52fhy/p/10164553.html
./configure --user=hdfs --group=hdfs --prefix=/home/hdfs/env/nginx --with-http_stub_status_module --with-http_ssl_module --with-http_v2_module --with-http_gzip_static_module --with-http_sub_module --add-module=/home/hdfs/env/lua/ngx_devel_kit-0.3.0 --add-module=/home/hdfs/env/lua/lua-nginx-module-0.10.13
make && make install
Nginx configuration file
http
log_format gps_logs_format escape=json '{"remote":$remote_addr,"msec":$msec,"http_host":$http_host,"request_uri":$request_uri,"request":$request_body}';
server
lua_need_request_body on;
location = /gps_logs {
content_by_lua_block {
ngx.req.read_body()
local data = ngx.req.get_body_data()
ngx.say(data)
}
access_log /home/hdfs/env/nginx/logs/gps_logs/access.log gps_logs_format;
}
Log rotation
vim gps_log_slice.sh
#!/bin/bash
# directory where the log files are stored
LOG_HOME="/home/hdfs/env/nginx/logs/gps_logs/"
# backup file name
LOG_PATH_BAK="$(date -d yesterday +%Y%m%d%H%M)".access.log
# rename the current log file
mv ${LOG_HOME}/access.log ${LOG_HOME}/back/${LOG_PATH_BAK}
# signal the Nginx master process to reopen its log files
kill -USR1 `cat /home/hdfs/env/nginx/logs/nginx.pid`
Set up the cron job
1 0 * * * sh /home/hdfs/env/nginx/logs/gps_logs/gps_log_slice.sh
Automatically delete log files older than 7 days
touch auto-del-7-day-ago-log.sh
chmod +x auto-del-7-day-ago-log.sh
vim auto-del-7-day-ago-log.sh
find /home/hdfs/env/nginx/logs/gps_logs/back -mtime +7 -name "*access.log" -exec rm -rf {} \;
crontab -e
10 0 * * * /home/hdfs/env/nginx/logs/gps_logs/auto-del-7-day-ago-log.sh > /home/hdfs/env/nginx/logs/gps_logs/auto-del.out 2>&1
Flume
HBase
Create the table
create 'uav_gps_logs', 'uav_info'
Kafka
Create the topic
kafka-topics.sh --zookeeper node2:2181,node3:2181,node4:2181 --create --topic uavgps --partitions 3 --replication-factor 3
Consume the topic (for verification)
kafka-console-consumer.sh --bootstrap-server node1:9092,node2:9092,node3:9092 --topic uavgps
Log -> Flume -> HBase & Kafka
gps-flume-hbase.conf
# Read messages from the log file in real time and write them to HBase without transformation
agent.sources = logfile-source
agent.channels = file-channel
agent.sinks = hbase-sink
# logfile-source configuration
agent.sources.logfile-source.type = exec
# the source is the Nginx access log written by the /gps_logs location
agent.sources.logfile-source.command = tail -f /home/hdfs/env/nginx/logs/gps_logs/access.log
agent.sources.logfile-source.checkperiodic = 10
# channel definition: memory or file; memory is faster but loses data on failure, file is slower but durable
#agent.channels.file-channel.type = memory
# channel capacity
#agent.channels.file-channel.capacity = 1000
# number of events the sink pulls from the channel per transaction
#agent.channels.file-channel.transactionCapacity = 100
agent.channels.file-channel.type = file
agent.channels.file-channel.checkpointDir = /home/hdfs/tmp/flume-hbase-gps/checkpoint
agent.channels.file-channel.dataDirs = /home/hdfs/tmp/flume-hbase-gps/data
# sink configured as HBase
agent.sinks.hbase-sink.type = org.apache.flume.sink.hbase.HBaseSink
# HBase table name and column family
agent.sinks.hbase-sink.table = uav_gps_logs
agent.sinks.hbase-sink.columnFamily = uav_info
agent.sinks.hbase-sink.serializer = org.apache.flume.sink.hbase.RegexHbaseEventSerializer
#agent.sinks.hbase-sink.serializer.regex = (.*?)\\\^A(.*?)\\\^A(.*?)\\\^A(.*?)\\\^A(.*?)\\\^A(.*?)\\\^A(.*?)\\\^A(.*?)\\\^A(.*?)
agent.sinks.hbase-sink.serializer.regex = ^(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)$
# column names within the column family; note: the row key is generated automatically
agent.sinks.hbase-sink.serializer.colNames=remote_addr,msec,uav_id,line_id,uav_type,uav_status,latitude,longitude,altitude,uav_time
# wire the source, sink and channel together
agent.sources.logfile-source.channels = file-channel
agent.sinks.hbase-sink.channel = file-channel
gps-flume-kafka.conf
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -f /home/hdfs/env/nginx/logs/gps_logs/access.log
a1.sources.r1.checkperiodic = 10
# Describe the sink
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = uavgps
a1.sinks.k1.brokerList = node2:9092,node3:9092,node4:9092
a1.sinks.k1.requiredAcks = 1
a1.sinks.k1.batchSize = 20
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start in the background
- Flume -> HBase
nohup flume-ng agent -c conf -f gps-flume-hbase.conf -n agent -Dflume.root.logger=INFO,console 1>out-start-gps-hbase.out 2>&1 &
- Flume -> Kafka
nohup flume-ng agent -c conf -f gps-flume-kafka.conf -n a1 -Dflume.root.logger=INFO,console 1>out-start-gps-kafka.out 2>&1 &
HBase -> Hive
CREATE EXTERNAL TABLE IF NOT EXISTS gps.uav_gps
(key STRING, uav_id STRING, line_id STRING, uav_type STRING, uav_status TINYINT, latitude DOUBLE, longitude DOUBLE, altitude DOUBLE, uav_time STRING)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping"=":key,uav_info:uav_id,uav_info:line_id,uav_info:uav_type,uav_info:uav_status,uav_info:latitude,uav_info:longitude,uav_info:altitude,uav_info:uav_time")
TBLPROPERTIES ("hbase.table.name"="default:uav_gps_logs");
Fetching GPS data in real time and computing speed
SparkStreaming + Kafka
package cn.yangxin.spark.stream.kafka
import java.text.SimpleDateFormat
import java.util
import java.util.{Calendar, Date}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Durations, Seconds, StreamingContext}
import org.apache.spark.{SparkConf, TaskContext}
import scala.collection.mutable
object UAVGPSRealTimeAnalyze {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("test").setMaster("local")
conf.set("es.nodes", "node2");
conf.set("es.index.auto.create", "true");
conf.set("es.mapping.id", "id");
conf.set("es.port", "9200");
// build the Spark Streaming context
val ssc = new StreamingContext(conf, Durations.seconds(5))
ssc.sparkContext.setLogLevel("ERROR")
// checkpoint directory to retain computation state
ssc.checkpoint("./MyCheckpoint")
val db: Int = 2;
val topic: String = "uavgps"
/**
* Fetch consumer offsets from Redis
*/
val currentTopicOffset: mutable.Map[String, String] = getOffSetFromRedis(db, topic)
// initial topic offsets read from Redis:
currentTopicOffset.foreach(tp => { println(s"initial offset read: $tp") })
// convert into the type required by ConsumerStrategies.Assign
val fromOffsets: Predef.Map[TopicPartition, Long] = currentTopicOffset.map { resultSet =>
new TopicPartition(topic, resultSet._1.toInt) -> resultSet._2.toLong
}.toMap
val kafkaParams: Map[String, Object] = Map[String, Object](
"bootstrap.servers" -> "node1:9092,node2:9092,node3:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "MyGroupId-uavgps",
"auto.offset.reset" -> "earliest",
"enable.auto.commit" -> "false"
)
// create the direct stream (5-second batches)
val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
PreferConsistent,
ConsumerStrategies.Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets)
)
/**
* Keep only GPS records whose status field is 2 (in flight); drop the rest
*/
val gpsLogFilter = stream.filter(line => {
val f = line.value().matches("^(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)$")
f && line.value().split(",")(4).toInt == 2
})
/**
* Extract id, latitude, longitude and time
*/
val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val point: DStream[(String, (Double, Double, Date, String))] = gpsLogFilter.map(line => {
val row: String = line.value()
val split: Array[String] = row.split(",")
(split(2), (split(5).toDouble, split(6).toDouble, format.parse(split(8)), "0.0"))
})
/**
* Compute the great-circle distance (in meters) between two latitude/longitude points
*/
def getDistance(lon1: Double, lat1: Double, lon2: Double, lat2: Double): Double = {
// pi is π, r is the Earth's radius
val pi = Math.PI
val r: Double = 6378137 // equatorial radius in meters
// a1, a2, b1, b2 are the latitudes and longitudes above converted to radians
val a1 = lat1 * pi / 180.0
val a2 = lon1 * pi / 180.0
val b1 = lat2 * pi / 180.0
val b2 = lon2 * pi / 180.0
val t1: Double = Math.cos(a1) * Math.cos(a2) * Math.cos(b1) * Math.cos(b2)
val t2: Double = Math.cos(a1) * Math.sin(a2) * Math.cos(b1) * Math.sin(b2)
val t3: Double = Math.sin(a1) * Math.sin(b1)
val distance = Math.acos(t1 + t2 + t3) * r
distance
}
/**
* Every 5 seconds, over a 10-second window, compute the speed between adjacent points for each UAV
*/
point.reduceByKeyAndWindow(
(t1: (Double, Double, Date, String), t2: (Double, Double, Date, String)) => {
val distance_ = getDistance(t1._2, t1._1, t2._2, t2._1)
val time_ = Math.abs(t2._3.getTime - t1._3.getTime) / 1000
val speed_ = (distance_ / time_).formatted("%.3f")
println(distance_, time_, speed_)
(t2._1, t2._2, t2._3, speed_)
}, Seconds(10), Seconds(5)).print()
// persist the consumed offsets to Redis
stream.foreachRDD(rdd => {
val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
// foreachPartition runs once per partition; log the offset range handled by each partition
rdd.foreachPartition { iter =>
val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
println(s"topic:${o.topic} partition:${o.partition} fromOffset:${o.fromOffset} untilOffset: ${o.untilOffset}")
}
saveOffsetToRedis(db, offsetRanges)
})
ssc.start()
ssc.awaitTermination()
ssc.stop()
}
def getOffSetFromRedis(db: Int, topic: String) = {
val jedis = RedisClient.pool.getResource
jedis.select(db)
val result: util.Map[String, String] = jedis.hgetAll(topic)
RedisClient.pool.returnResource(jedis)
if (result.size() == 0) {
result.put("0", "0")
result.put("1", "0")
result.put("2", "0")
}
import scala.collection.JavaConversions.mapAsScalaMap
val offsetMap: scala.collection.mutable.Map[String, String] = result
offsetMap
}
/**
* Save consumer offsets to Redis
*
*/
def saveOffsetToRedis(db: Int, offsetRanges: Array[OffsetRange]) = {
val jedis = RedisClient.pool.getResource
jedis.select(db)
offsetRanges.foreach(one => {
jedis.hset(one.topic, one.partition.toString, one.untilOffset.toString)
})
println("offsets saved to Redis")
RedisClient.pool.returnResource(jedis)
}
}
Elasticsearch
Preventing split-brain
Master node
node.master: true
node.data: false
Data node
node.master: false
node.data: true
All nodes
discovery.zen.ping.multicast.enabled: false
discovery.zen.ping.unicast.hosts: ["master", "slave1", "slave2"]
discovery.zen.minimum_master_nodes: 2
# enable inline scripting (needed for partial updates)
script.engine.groovy.inline.update: true
elasticsearch.yml
cluster.name: es-cluster # cluster name; nodes join the cluster whose name matches
node.name: es-node1 # node name, must be unique within the cluster
node.master: true # whether this node is eligible to become master
node.data: false
network.host: 0.0.0.0 # bind address; 0.0.0.0 allows external access
http.port: 9200 # must be unique when running multiple ES instances on one server, otherwise keep the default
transport.tcp.port: 9300 # must be unique when running multiple ES instances on one server, otherwise keep the default
discovery.zen.ping.unicast.hosts: ["node1","node2","node3","node4"] # transport addresses of all cluster nodes (the TCP port above)
discovery.zen.minimum_master_nodes: 2 # prevents split-brain; set to (master-eligible nodes / 2) + 1
http.cors.enabled: true # enable CORS so external tools and plugins can access the API
http.cors.allow-origin: "*" # allowed CORS origins
script.engine.groovy.inline.update: true
jvm.options
-Xms256m
-Xmx256m
/etc/sysctl.conf
vm.max_map_count=262144
sysctl -p # reload kernel parameters from /etc/sysctl.conf (or another specified file)
sysctl -a | grep vm.max_map_count # check the current value
sysctl -w vm.max_map_count=262144
ik and pinyin analyzers
Kibana
- Edit the configuration file
config/kibana.yml
server.port: 5601 ## service port
server.host: "0.0.0.0" ## bind address; 0.0.0.0 allows external access
elasticsearch.url: "http://node1:9200" ## Elasticsearch service address
Writing the DStream to Elasticsearch
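A minimal sketch of the ES write, assuming the reduceByKeyAndWindow result above is kept in a val speedStream: DStream[(String, (Double, Double, Date, String))] instead of only being printed, and that a matching elasticsearch-spark (elasticsearch-hadoop) connector jar is on the classpath. The connector picks up the es.nodes / es.port / es.index.auto.create / es.mapping.id settings already placed on the SparkConf; the helper name writeSpeedToEs and the index name uav_gps_speed/gps are illustrative assumptions.
import java.util.Date
import org.apache.spark.streaming.dstream.DStream
import org.elasticsearch.spark.streaming.EsSparkStreaming

// Hypothetical helper: index the windowed (id -> position/speed) stream into ES.
def writeSpeedToEs(speedStream: DStream[(String, (Double, Double, Date, String))]): Unit = {
  val docs = speedStream.map { case (id, (lat, lon, time, speed)) =>
    Map(
      "id"        -> id,            // es.mapping.id = "id": used as the document _id
      "latitude"  -> lat,
      "longitude" -> lon,
      "uav_time"  -> time.getTime,  // epoch millis
      "speed"     -> speed.toDouble // m/s, parsed back from the "%.3f" string
    )
  }
  // Index/type name is an assumption; adjust it to your own mapping.
  EsSparkStreaming.saveToEs(docs, "uav_gps_speed/gps")
}
Register the helper before ssc.start() (for example inside UAVGPSRealTimeAnalyze, right after the reduceByKeyAndWindow call). Because es.mapping.id points at the id field, each UAV keeps a single document that is overwritten with its latest position and speed on every window.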
Submitting the job
Write the submit script start-streaming.sh
jarlib="/home/hdfs/uav/lib"
CLASS_PATH=""
for i in `ls $jarlib/*.jar`; do
CLASS_PATH="$CLASS_PATH,$i";
done
CLASS_PATH=${CLASS_PATH#,}  # strip the leading comma
APP_MAINCLASS="cn.yangxin.spark.stream.kafka.UAVGPSRealTimeAnalyze"
APP_JAR="/home/hdfs/uav/uavstream.jar"
JAVA_CMD="spark-submit --master spark://node1:7077 --class $APP_MAINCLASS --jars $CLASS_PATH --deploy-mode cluster $APP_JAR"
echo $JAVA_CMD
eval $JAVA_CMD
Elasticsearch integration reference: https://www.jianshu.com/p/0e73958bfd70
Azkaban Installation
Download the release version
Install the Gradle wrapper (gradlew)
Compile
./gradlew clean build
Copy the following build artifacts to node4:
- azkaban-4.0.0/azkaban-exec-server/build/distributions/azkaban-exec-server-0.1.0-SNAPSHOT.tar.gz
- azkaban-4.0.0/azkaban-db/build/distributions/azkaban-db-0.1.0-SNAPSHOT.tar.gz
- azkaban-4.0.0/azkaban-web-server/build/distributions/azkaban-web-server-0.1.0-SNAPSHOT.tar.gz
Create the SSL keystore
keytool -keystore keystore -alias jetty -genkey -keyalg RSA
azkaban-web/conf/azkaban.properties
# Azkaban Personalization Settings
azkaban.name=My Azkaban
azkaban.label=My Local Azkaban
azkaban.color=#FF3601
azkaban.default.servlet.path=/index
web.resource.dir=/home/hdfs/env/azkaban/azkaban-web/web/
default.timezone.id=Asia/Shanghai
# Azkaban UserManager class
user.manager.class=azkaban.user.XmlUserManager
user.manager.xml.file=/home/hdfs/env/azkaban/azkaban-web/conf/azkaban-users.xml
# Loader for projects
executor.global.properties=/home/hdfs/env/azkaban/azkaban-web/conf/global.properties
azkaban.project.dir=projects
# Velocity dev mode
velocity.dev.mode=false
# Azkaban Jetty server properties.
jetty.use.ssl=false
jetty.maxThreads=25
jetty.port=8081
jetty.keystore=/home/hdfs/env/azkaban/keystore # keystore file
jetty.password=azkaban # keystore password
jetty.keypassword=azkaban # key password, same as the keystore password
jetty.truststore=/home/hdfs/env/azkaban/keystore # truststore file
jetty.trustpassword=azkaban # truststore password
# Azkaban Executor settings
# mail settings
mail.sender=
mail.host=
# User facing web server configurations used to construct the user facing server URLs. They are useful when there is a reverse proxy between Azkaban web servers and users.
# enduser -> myazkabanhost:443 -> proxy -> localhost:8081
# when this parameters set then these parameters are used to generate email links.
# if these parameters are not set then jetty.hostname, and jetty.port(if ssl configured jetty.ssl.port) are used.
# azkaban.webserver.external_hostname=myazkabanhost.com
# azkaban.webserver.external_ssl_port=443
# azkaban.webserver.external_port=8081
job.failure.email=
job.success.email=
lockdown.create.projects=false
cache.directory=cache
# JMX stats
jetty.connector.stats=true
executor.connector.stats=true
executor.port=12321
# Azkaban mysql settings by default. Users should configure their own username and password.
database.type=mysql
mysql.port=3306
mysql.host=node1
mysql.database=azkaban
mysql.user=root
mysql.password=123456
mysql.numconnections=100
#Multiple Executor
azkaban.use.multiple.executors=true
azkaban.executorselector.filters=StaticRemainingFlowSize,CpuStatus
azkaban.executorselector.comparator.NumberOfAssignedFlowComparator=1
azkaban.executorselector.comparator.Memory=1
azkaban.executorselector.comparator.LastDispatched=1
azkaban.executorselector.comparator.CpuUsage=1
azkaban-exec/conf/azkaban.properties
# Azkaban Personalization Settings
azkaban.name=My Azkaban
azkaban.label=My Local Azkaban
azkaban.color=#FF3601
azkaban.default.servlet.path=/index
web.resource.dir=/home/hdfs/env/azkaban/azkaban-exec/web/
default.timezone.id=Asia/Shanghai
# Azkaban UserManager class
user.manager.class=azkaban.user.XmlUserManager
user.manager.xml.file=/home/hdfs/env/azkaban/azkaban-exec/conf/azkaban-users.xml
# Loader for projects
executor.global.properties=/home/hdfs/env/azkaban/azkaban-exec/conf/global.properties
azkaban.project.dir=projects
# Velocity dev mode
velocity.dev.mode=false
# Azkaban Jetty server properties.
jetty.use.ssl=false
jetty.maxThreads=25
jetty.port=8081
# Where the Azkaban web server is located
azkaban.webserver.url=http://localhost:8081
# mail settings
mail.sender=
mail.host=
# User facing web server configurations used to construct the user facing server URLs. They are useful when there is a reverse proxy between Azkaban web servers and users.
# enduser -> myazkabanhost:443 -> proxy -> localhost:8081
# when this parameters set then these parameters are used to generate email links.
# if these parameters are not set then jetty.hostname, and jetty.port(if ssl configured jetty.ssl.port) are used.
# azkaban.webserver.external_hostname=myazkabanhost.com
# azkaban.webserver.external_ssl_port=443
# azkaban.webserver.external_port=8081
job.failure.email=
job.success.email=
lockdown.create.projects=false
cache.directory=cache
# JMX stats
jetty.connector.stats=true
executor.connector.stats=true
# Azkaban plugin settings
azkaban.jobtype.plugin.dir=/home/hdfs/env/azkaban/azkaban-exec/plugins/jobtypes
# Azkaban mysql settings by default. Users should configure their own username and password.
database.type=mysql
mysql.port=3306
mysql.host=node1
mysql.database=azkaban
mysql.user=root
mysql.password=123456
mysql.numconnections=100
# Azkaban Executor settings
executor.maxThreads=50
executor.flow.threads=30
executor.props.resolve.overrideExisting.enabled=false
executor.port=12321
Startup
- Start the executor: ./start-exec.sh
- Activate the executor in a browser: http://node4:12321/executor?action=activate
- Start the web server: ./start-web.sh
- Open the web UI in a browser: http://node4:8081/index