Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
E
enoch
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
DevOps
enoch
Commits
e9cfb095
Commit
e9cfb095
authored
Dec 16, 2019
by
jingbo.wang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
监控节点的健康状态,并且dingding告警
parent
a9ddbfc4
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
84 additions
and
215 deletions
+84
-215
main.go
main.go
+6
-4
global.go
pkg/global/global.go
+7
-2
node_check.go
pkg/node-check/node_check.go
+69
-205
node_check_test.go
pkg/node-check/node_check_test.go
+2
-4
No files found.
main.go
View file @
e9cfb095
...
...
@@ -2,12 +2,13 @@ package main
import
(
"fmt"
api_server
"git.quantgroup.cn/DevOps/enoch/pkg/api-server"
"git.quantgroup.cn/DevOps/enoch/pkg/api-server"
"git.quantgroup.cn/DevOps/enoch/pkg/dao"
"git.quantgroup.cn/DevOps/enoch/pkg/global"
"git.quantgroup.cn/DevOps/enoch/pkg/glog"
node_check
"git.quantgroup.cn/DevOps/enoch/pkg/node-check"
"git.quantgroup.cn/DevOps/enoch/pkg/points"
report_form
"git.quantgroup.cn/DevOps/enoch/pkg/report-form"
"git.quantgroup.cn/DevOps/enoch/pkg/report-form"
"github.com/Shopify/sarama"
_
"github.com/mkevac/debugcharts"
"github.com/valyala/fasthttp"
...
...
@@ -79,6 +80,7 @@ func handlerKafkaMsg() {
}
}
//TODO 可优化成Raft算法,目前是固定节点的
func
isMaster
()
bool
{
//开发环境,但InfluxDb的Ip是生产环境ip的时候,不执行初始化操作
if
global
.
IsDev
()
&&
strings
.
Contains
(
global
.
InfluxDbAddress
,
"172.16"
)
{
...
...
@@ -116,10 +118,10 @@ func main() {
report_form
.
RegularReport
(
global
.
ReportFormDir
)
//每周1早10点发送邮件
report_form
.
RegularMail
(
global
.
ReportFormDir
)
//节点健康状态检查
node_check
.
NodeHealthCheckAndNotify
()
//TODO 告警策略
//TODO node状态监控
}
//对外api
...
...
pkg/global/global.go
View file @
e9cfb095
...
...
@@ -81,8 +81,11 @@ func init() {
}
kvMap
:=
make
(
map
[
string
]
string
)
for
k
,
v
:=
range
result
.
Details
{
if
_
,
ok
:=
v
.
(
string
);
ok
{
kvMap
[
k
]
=
v
.
(
string
)
if
vString
,
ok
:=
v
.
(
string
);
ok
{
kvMap
[
k
]
=
vString
}
if
vFloat64
,
ok
:=
v
.
(
float64
);
ok
{
kvMap
[
k
]
=
strconv
.
Itoa
(
int
(
vFloat64
))
}
}
Config
.
RefreshKvMap
(
kvMap
)
...
...
@@ -122,11 +125,13 @@ func init() {
Logger
.
Error
(
"get must conf error: application consul.datacenter"
)
}
else
{
ConsulDc
=
consulDc
Logger
.
Debug
(
"consul dc"
,
ConsulDc
)
}
if
consulAddress
,
ok
:=
Config
.
Get
(
NamespaceApplication
,
"consul.address"
);
!
ok
{
Logger
.
Error
(
"get must conf error: application consul.address"
)
}
else
{
ConsulAddress
=
consulAddress
Logger
.
Debug
(
"consul address"
,
ConsulAddress
)
}
consulCluster
:=
strings
.
Split
(
ConsulAddress
,
","
)
if
e
:=
registry
.
Init
(
"consul"
,
map
[
string
]
interface
{}{
"dc"
:
ConsulDc
,
"cluster"
:
consulCluster
});
e
!=
nil
{
...
...
pkg/node-check/node_check.go
View file @
e9cfb095
package
node_check
// NodeHealthCheckAndNotify is an empty stub here: it takes no arguments,
// returns nothing, and its body contains no statements.
func
NodeHealthCheckAndNotify
()
{
}
/*
import
(
"encoding/json"
"fmt"
"git.quantgroup.cn/DevOps/enoch/pkg/global"
"git.quantgroup.cn/DevOps/enoch/pkg/glog"
"github.com/vrg0/go-common/notify"
"github.com/vrg0/go-common/registry"
"strings"
"sync"
)
const (
Passing = "passing"
Critical = "critical"
var
(
notifyDingDing
=
newNotifyDingDing
()
)
//节点健康状态检查和告警
func
newNotifyDingDing
()
*
notify
.
Notify
{
cfgStr
:=
global
.
Config
.
GetOrDefault
(
global
.
NamespaceApplication
,
"notify.dingding"
,
`[]`
)
dstList
:=
make
([]
string
,
0
)
_
=
json
.
Unmarshal
([]
byte
(
cfgStr
),
&
dstList
)
return
notify
.
New
(
dstList
)
}
func
NodeHealthCheckAndNotify
()
{
//设置观察者节点
if err := registry.SetObserver(
&watch{}
); err != nil {
if
err
:=
registry
.
SetObserver
(
newWatch
()
);
err
!=
nil
{
glog
.
Info
(
"设置观察者节点失败"
,
err
)
return
}
}
// watch is the registry observer: it watches service status and raises an
// alert when a node goes down or when a whole service goes down.
type
watch
struct
{
// serviceMap is the locally cached service state, indexed by service name.
serviceMap
map
[
string
]
*
registry
.
Service
// serviceMapLock is locked by the watch methods before they touch serviceMap.
serviceMapLock
*
sync
.
Mutex
}
func
newWatch
()
*
watch
{
return
&
watch
{
serviceMap
:
make
(
map
[
string
]
*
registry
.
Service
),
serviceMapLock
:
new
(
sync
.
Mutex
),
}
}
func
(
w
watch
)
DeleteService
(
serviceName
string
)
{
w
.
serviceMapLock
.
Lock
()
defer
w
.
serviceMapLock
.
Unlock
()
//服务下线
glog
.
Info
(
"服务下线:"
,
serviceName
)
//删除服务
...
...
@@ -43,220 +58,69 @@ func (w watch) UpdateNodes(service *registry.Service) {
w
.
serviceMapLock
.
Lock
()
defer
w
.
serviceMapLock
.
Unlock
()
if s, ok := w.serviceMap[service.Name]; !ok {
glog.Info("服务上线:", service.Name)
} else {
for id, node := range service.NodeMap {
if node.Status == Critical {
}
}
}
//状态检查&告警
w
.
statusCheckAndNotify
(
service
)
//更新
服务
状态
//更新
本地缓存的节点
状态
w
.
serviceMap
[
service
.
Name
]
=
service
}
*/
/*
type watch struct{}
func (w watch) DeleteService(serviceName string) {
//pass
servicesStatusLock.Lock()
defer servicesStatusLock.Unlock()
delete(servicesStatus, serviceName)
}
func (w watch) UpdateNodes(service *registry.Service) {
servicesStatusLock.Lock()
defer servicesStatusLock.Unlock()
//单个节点挂了告警
if oldService, ok := servicesStatus[service.Name]; ok {
func
(
w
watch
)
statusCheckAndNotify
(
service
*
registry
.
Service
)
{
//服务信息初始化
//服务不存在
if
_
,
ok
:=
w
.
serviceMap
[
service
.
Name
];
!
ok
{
glog
.
Info
(
"服务信息初始化:"
,
service
.
Name
)
return
}
//服务存在,但所有节点都是critical 或 没有节点
if
service
,
ok
:=
w
.
serviceMap
[
service
.
Name
];
ok
{
allNodeCritical
:=
true
for
_
,
node
:=
range
service
.
NodeMap
{
if oldNode, ok := oldService.NodeMap[node.Id]; ok {
if oldNode.Status == Passing && node.Status == Critical {
logger.Warning.Print(service.Name, " ", node.Id, "---!!!node critical!!!---")
if _, ok := IgnoreServiceMap[service.Name]; !ok {
_ = dingding.SenderDingDing(service.Name+" "+node.Id+" "+"---!!!node critical!!!---", dingding.DefaultDingURL)
}
}
if
node
.
Status
==
registry
.
Passing
{
allNodeCritical
=
false
break
}
}
}
//整个服务挂了告警
//如果 服务存在,并且服务的old状态为passing,并且服务的now状态为critical,则报警,否则记录服务状态
serviceString := serviceStr(service)
if oldService, ok := servicesStatus[service.Name]; ok && serviceStatus(oldService) && !serviceStatus(service) {
logger.Warning.Print(serviceString, "---!!!service critical!!!---")
if _, ok := IgnoreServiceMap[service.Name]; !ok {
_ = dingding.SenderDingDing(serviceString+"---!!!service critical!!!---", dingding.DefaultDingURL)
if
allNodeCritical
{
glog
.
Info
(
"服务信息初始化:"
,
service
.
Name
)
return
}
handler(service.Name)
} else {
logger.Info.Print(serviceString)
}
//更新服务状态
//深拷贝对象
newService := registry.NewService(service.Name)
for kk, vv := range service.NodeMap {
newNode := registry.Node{
ServiceName:vv.ServiceName,
Id:vv.Id,
Port:vv.Port,
Address:vv.Address,
Status:vv.Status,
}
for x, y := range vv.Meta {
newNode.Meta[x] = y
}
newService.NodeMap[kk] = &newNode
}
servicesStatus[service.Name] = newService
}
*/
/*
//节点的健康状态告警
func NodeCheck() {
//服务状态初始化
InitServiceStatus()
//设置观察者节点
if e := registry.SetObserver(&watch{}); e != nil {
glog.Info("设置观察者节点失败", e)
return
}
}
func InitServiceStatus() {
if s, ok := registry.GetServiceMap(); ok {
servicesStatus = s
}
}
*/
/*
var HandlerMap = new(sync.Map)
var HttpGetRetryCount = 3
var HttpTimeOut = time.Second * 10
var IgnoreServiceMap = make(map[string]struct{})
func init() {
//TODO 灵活配置
HandlerMap.Store("aaaa", "http://www.baidasdfasdfasdfasdfu.com/")
HandlerMap.Store("heimdallr", "http://172.20.6.33:8989/service-down")
// HandlerMap.Store("aaaa", "http://172.20.6.33:8989/service-down")
//TODO 灵活配置
IgnoreServiceMap["vcc-talos"] = struct{}{}
IgnoreServiceMap["aaaa"] = struct{}{}
}
func httpGet(url string, timeout time.Duration) (*http.Response, error) {
ctx, cancel := context.WithCancel(context.TODO())
req, e := http.NewRequest("GET", url, nil)
if e != nil {
return nil, e
} else {
req = req.WithContext(ctx)
}
_ = time.AfterFunc(timeout, func() {
cancel()
})
return http.DefaultClient.Do(req)
}
func handler(serviceName string) {
if url, ok := HandlerMap.Load(serviceName); ok {
for i := 0; i < HttpGetRetryCount; i++ {
if resp, e := httpGet(url.(string), HttpTimeOut); e != nil {
logger.Error.Print(" handler service: ", serviceName, " ", e)
} else {
logger.Info.Print(" handler service: ", serviceName, " ", resp.StatusCode)
//服务挂掉告警(服务的全部节点都挂掉)
if
len
(
service
.
NodeMap
)
!=
0
{
serviceUp
:=
false
for
_
,
node
:=
range
service
.
NodeMap
{
if
node
.
Status
==
registry
.
Passing
{
serviceUp
=
true
break
}
}
} else {
logger.Info.Print(" handler service: ", serviceName, " ", "not found handler hook api")
}
}
func serviceStatus(service *registry.Service) bool {
for _, node := range service.NodeMap {
if node.Status == Passing {
return true
if
!
serviceUp
{
sb
:=
new
(
strings
.
Builder
)
sb
.
WriteString
(
fmt
.
Sprintf
(
"服务健康状态异常:%s"
,
service
.
Name
))
for
_
,
node
:=
range
service
.
NodeMap
{
sb
.
WriteString
(
fmt
.
Sprintf
(
" %s:%s"
,
node
.
Id
,
node
.
Status
))
}
sb
.
WriteString
(
"
\n
"
)
glog
.
Warn
(
sb
.
String
())
notifyDingDing
.
SendText
(
sb
.
String
())
return
}
}
return false
}
func serviceStr(service *registry.Service) string {
rtn := service.Name + " "
//节点挂掉告警(服务的部分节点挂掉)
for
_
,
node
:=
range
service
.
NodeMap
{
rtn += node.Id + ":" + node.Status + " "
}
return rtn
}
func (w watch) AddNode(node *registry.Node) {
//pass
}
func (w watch) DelNode(node *registry.Node) {
//pass
}
var (
servicesStatus = make(map[string]*registry.Service)
servicesStatusLock = new(sync.Mutex)
)
func InitServiceStatus() {
servicesStatus = registry.GetServiceMap()
}
func NodeCheck() {
defer func() {
if e := recover(); e != nil {
logger.Info.Print("node check panic: ", e)
_ = dingding.SenderDingDing("node check panic!", dingding.DefaultDingURL)
time.Sleep(time.Second * 1)
NodeCheck()
//如果节点状态从passing变为其他,则告警
if
oldService
,
ok
:=
w
.
serviceMap
[
node
.
ServiceName
];
ok
{
if
oldNode
,
ok
:=
oldService
.
NodeMap
[
node
.
Id
];
ok
{
if
oldNode
.
Status
==
registry
.
Passing
&&
node
.
Status
!=
registry
.
Passing
{
s
:=
fmt
.
Sprintf
(
"服务节点健康状态异常:%s %s:%s"
,
service
.
Name
,
node
.
Id
,
node
.
Status
)
glog
.
Warn
(
s
)
notifyDingDing
.
SendText
(
s
)
}
}
}
}()
//注册器初始化
dc := "3c"
cluster := []string{"172.30.12.2:8500", "172.30.12.3:8500", "172.30.12.4:8500"}
if e := registry.Init("consul", map[string]interface{}{"dc": dc, "cluster": cluster}); e != nil {
logger.Info.Print("registry init error:", e)
os.Exit(-1)
}
time.Sleep(time.Second * 1)
//服务状态初始化
InitServiceStatus()
//设置观察者
if e := registry.SetObserver("watch", &watch{}); e != nil {
logger.Info.Print("set observer error:", e)
os.Exit(-1)
}
select {}
}
*/
pkg/node-check/node_check_test.go
View file @
e9cfb095
package
node_check
/*
import
"testing"
func
TestNodeCheck
(
t
*
testing
.
T
)
{
go NodeCheck
()
NodeHealthCheckAndNotify
()
select{}
select
{}
}
*/
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment