diff --git a/internal/tools/cron/cron_task.go b/internal/tools/cron/cron_task.go index 7ae314193..a4de309d4 100644 --- a/internal/tools/cron/cron_task.go +++ b/internal/tools/cron/cron_task.go @@ -58,6 +58,11 @@ func Start(ctx context.Context, conf *Config, client discovery.Conn, service grp cm.Watch(ctx) } + locker, err := NewEtcdLocker(client.(*etcd.SvcDiscoveryRegistryImpl).GetClient()) + if err != nil { + return err + } + srv := &cronServer{ ctx: ctx, config: conf, @@ -65,6 +70,7 @@ func Start(ctx context.Context, conf *Config, client discovery.Conn, service grp msgClient: msg.NewMsgClient(msgConn), conversationClient: pbconversation.NewConversationClient(conversationConn), thirdClient: third.NewThirdClient(thirdConn), + locker: locker, } if err := srv.registerClearS3(); err != nil { @@ -81,6 +87,8 @@ func Start(ctx context.Context, conf *Config, client discovery.Conn, service grp log.ZDebug(ctx, "cron task server is running") <-ctx.Done() log.ZDebug(ctx, "cron task server is shutting down") + srv.cron.Stop() + return nil } @@ -91,6 +99,7 @@ type cronServer struct { msgClient msg.MsgClient conversationClient pbconversation.ConversationClient thirdClient third.ThirdClient + locker *EtcdLocker } func (c *cronServer) registerClearS3() error { @@ -98,7 +107,9 @@ func (c *cronServer) registerClearS3() error { log.ZInfo(c.ctx, "disable scheduled cleanup of s3", "fileExpireTime", c.config.CronTask.FileExpireTime, "deleteObjectType", c.config.CronTask.DeleteObjectType) return nil } - _, err := c.cron.AddFunc(c.config.CronTask.CronExecuteTime, c.clearS3) + _, err := c.cron.AddFunc(c.config.CronTask.CronExecuteTime, func() { + c.locker.ExecuteWithLock(c.ctx, "clearS3", c.clearS3) + }) return errs.WrapMsg(err, "failed to register clear s3 cron task") } @@ -107,11 +118,15 @@ func (c *cronServer) registerDeleteMsg() error { log.ZInfo(c.ctx, "disable scheduled cleanup of chat records", "retainChatRecords", c.config.CronTask.RetainChatRecords) return nil } - _, err := c.cron.AddFunc(c.config.CronTask.CronExecuteTime, c.deleteMsg) + _, err := c.cron.AddFunc(c.config.CronTask.CronExecuteTime, func() { + c.locker.ExecuteWithLock(c.ctx, "deleteMsg", c.deleteMsg) + }) return errs.WrapMsg(err, "failed to register delete msg cron task") } func (c *cronServer) registerClearUserMsg() error { - _, err := c.cron.AddFunc(c.config.CronTask.CronExecuteTime, c.clearUserMsg) + _, err := c.cron.AddFunc(c.config.CronTask.CronExecuteTime, func() { + c.locker.ExecuteWithLock(c.ctx, "clearUserMsg", c.clearUserMsg) + }) return errs.WrapMsg(err, "failed to register clear user msg cron task") } diff --git a/internal/tools/cron/dist_look.go b/internal/tools/cron/dist_look.go new file mode 100644 index 000000000..d1d2d1cb0 --- /dev/null +++ b/internal/tools/cron/dist_look.go @@ -0,0 +1,89 @@ +package cron + +import ( + "context" + "fmt" + "os" + "time" + + "github.com/openimsdk/tools/log" + clientv3 "go.etcd.io/etcd/client/v3" + "go.etcd.io/etcd/client/v3/concurrency" +) + +const ( + lockLeaseTTL = 300 +) + +type EtcdLocker struct { + client *clientv3.Client + instanceID string +} + +// NewEtcdLocker creates a new etcd distributed lock +func NewEtcdLocker(client *clientv3.Client) (*EtcdLocker, error) { + hostname, _ := os.Hostname() + pid := os.Getpid() + instanceID := fmt.Sprintf("%s-pid-%d-%d", hostname, pid, time.Now().UnixNano()) + + locker := &EtcdLocker{ + client: client, + instanceID: instanceID, + } + + return locker, nil +} + +func (e *EtcdLocker) ExecuteWithLock(ctx context.Context, taskName string, task func()) { + session, err := concurrency.NewSession(e.client, concurrency.WithTTL(lockLeaseTTL)) + if err != nil { + log.ZWarn(ctx, "Failed to create etcd session", err, + "taskName", taskName, + "instanceID", e.instanceID) + return + } + defer session.Close() + + lockKey := fmt.Sprintf("openim/crontask/%s", taskName) + mutex := concurrency.NewMutex(session, lockKey) + + ctxWithTimeout, cancel := context.WithTimeout(ctx, 100*time.Millisecond) + defer cancel() + + err = mutex.TryLock(ctxWithTimeout) + if err != nil { + if err == context.DeadlineExceeded { + log.ZDebug(ctx, "Task is being executed by another instance, skipping", + "taskName", taskName, + "instanceID", e.instanceID) + } else { + log.ZWarn(ctx, "Failed to acquire task lock", err, + "taskName", taskName, + "instanceID", e.instanceID) + } + return + } + + defer func() { + if err := mutex.Unlock(ctx); err != nil { + log.ZWarn(ctx, "Failed to release task lock", err, + "taskName", taskName, + "instanceID", e.instanceID) + } else { + log.ZInfo(ctx, "Successfully released task lock", + "taskName", taskName, + "instanceID", e.instanceID) + } + }() + + log.ZInfo(ctx, "Successfully acquired task lock, starting execution", + "taskName", taskName, + "instanceID", e.instanceID, + "sessionID", session.Lease()) + + task() + + log.ZInfo(ctx, "Task execution completed", + "taskName", taskName, + "instanceID", e.instanceID) +} diff --git a/start-config.yml b/start-config.yml index 1231b5d0d..da959044d 100644 --- a/start-config.yml +++ b/start-config.yml @@ -1,6 +1,6 @@ serviceBinaries: openim-api: 1 - openim-crontask: 1 + openim-crontask: 4 openim-rpc-user: 1 openim-msggateway: 1 openim-push: 8