手?jǐn)]Golang 基本數(shù)據(jù)結(jié)構(gòu)與算法 k-means聚類算法
緣起
最近閱讀<<我的第一本算法書>>(【日】石田保輝;宮崎修一)
本系列筆記擬采用golang練習(xí)之
k-means聚類算法
聚類就是在輸入為多個(gè)數(shù)據(jù)時(shí), 將“相似”的數(shù)據(jù)分為一組的操作。 k-means算法是聚類算法中的一種。 首先隨機(jī)選擇k個(gè)點(diǎn)作為簇的中心點(diǎn), 然后重復(fù)執(zhí)行“將數(shù)據(jù)分到相應(yīng)的簇中”和 “將中心點(diǎn)移到重心的位置”這兩個(gè)操作, 直到中心點(diǎn)不再發(fā)生變化為止。 k-means算法中,隨著操作的不斷重復(fù), 中心點(diǎn)的位置必定會在某處收斂, 這一點(diǎn)已經(jīng)在數(shù)學(xué)層面上得到證明。 摘自 <<我的第一本算法書
【日】石田保輝;宮崎修一場景
- 某地突然爆發(fā)新冠疫情, 現(xiàn)防疫急需根據(jù)病例分布, 查找可能的病源地
- 首先將病例分布的坐標(biāo), 錄入系統(tǒng)
- 然后根據(jù)k-means算法, 按k從1到3, 分別進(jìn)行聚類
- 聚類的中心點(diǎn), 可能就是病源地
流程
- 給定若干樣本, 和樣本距離計(jì)算器, 需要求解k個(gè)樣本中心點(diǎn)
- 首先從樣本中隨機(jī)抽取k個(gè)點(diǎn), 作為中心點(diǎn)
-
循環(huán)每個(gè)樣本
- 分別計(jì)算樣本點(diǎn)到k個(gè)中心點(diǎn)的距離
- 判斷距離樣本點(diǎn)最小的中心點(diǎn)
- 將樣本劃分到該最小中心點(diǎn)的簇
-
計(jì)算每個(gè)簇的中心點(diǎn), 作為新的中心點(diǎn)
- 循環(huán)簇中的每個(gè)樣本
- 計(jì)算該樣本, 到本簇其他樣本的距離之和
- 與其他樣本的距離和最小的點(diǎn), 就是新的中心點(diǎn)
- 重復(fù)3-4, 直到中心點(diǎn)不再變化, 計(jì)算完畢
設(shè)計(jì)
- IPoint: 樣本點(diǎn)接口, 其實(shí)是一個(gè)空接口
- IDistanceCalculator: 距離計(jì)算器接口
- IClassifier: 分類器接口, 將samples聚類成k個(gè), 并返回k個(gè)中心點(diǎn)
- tPerson: 病例樣本點(diǎn), 實(shí)現(xiàn)IPoint接口, 含x,y坐標(biāo)
- tPersonDistanceCalculator: 病例距離計(jì)算器, 計(jì)算兩點(diǎn)間x,y坐標(biāo)的直線距離
- tKMeansClassifier: k-means聚類器, 實(shí)現(xiàn)IClassifier接口.
單元測試
k_means_test.go
- package others
- import (
- km "learning/gooop/others/k_means"
- "strings"
- "testing"
- )
- func Test_KMeans(t *testing.T) {
- // 創(chuàng)建樣本點(diǎn)
- samples := []km.IPoint {
- km.NewPerson(2, 11),
- km.NewPerson(2, 8),
- km.NewPerson(2, 6),
- km.NewPerson(3, 12),
- km.NewPerson(3, 10),
- km.NewPerson(4, 7),
- km.NewPerson(4, 3),
- km.NewPerson(5, 11),
- km.NewPerson(5, 9),
- km.NewPerson(5, 2),
- km.NewPerson(7, 9),
- km.NewPerson(7, 6),
- km.NewPerson(7, 3),
- km.NewPerson(8, 12),
- km.NewPerson(9, 3),
- km.NewPerson(9, 5),
- km.NewPerson(9, 10),
- km.NewPerson(10, 3),
- km.NewPerson(10, 6),
- km.NewPerson(10, 12),
- km.NewPerson(11, 9),
- }
- fnPoints2String := func(points []km.IPoint) string {
- items := make([]string, len(points))
- for i,it := range points {
- items[i] = it.String()
- }
- return strings.Join(items, " ")
- }
- for k:=1;k<=3;k++ {
- centers := km.KMeansClassifier.Classify(samples, km.PersonDistanceCalculator, k)
- t.Log(fnPoints2String(centers))
- }
- }
測試輸出
- $ go test -v k_means_test.go
- === RUN Test_KMeans
- k_means_test.go:53: p(7,6)
- k_means_test.go:53: p(5,9) p(7,3)
- k_means_test.go:53: p(9,10) p(3,10) p(7,3)
- --- PASS: Test_KMeans (0.00s)
- PASS
- ok command-line-arguments 0.002s
IPoint.go
樣本點(diǎn)接口, 其實(shí)是一個(gè)空接口
- package km
- import "fmt"
- type IPoint interface {
- fmt.Stringer
- }
IDistanceCalculator.go
距離計(jì)算器接口
- package km
- type IDistanceCalculator interface {
- Calc(a, b IPoint) int
- }
IClassifier.go
分類器接口, 將samples聚類成k個(gè), 并返回k個(gè)中心點(diǎn)
- package km
- type IClassifier interface {
- // 將samples聚類成k個(gè), 并返回k個(gè)中心點(diǎn)
- Classify(samples []IPoint, calc IDistanceCalculator, k int) []IPoint
- }
tPerson.go
病例樣本點(diǎn), 實(shí)現(xiàn)IPoint接口, 含x,y坐標(biāo)
- package km
- import "fmt"
- type tPerson struct {
- x int
- y int
- }
- func NewPerson(x, y int) IPoint {
- return &tPerson{x, y, }
- }
- func (me *tPerson) String() string {
- return fmt.Sprintf("p(%v,%v)", me.x, me.y)
- }
tPersonDistanceCalculator.go
病例距離計(jì)算器, 計(jì)算兩點(diǎn)間x,y坐標(biāo)的直線距離
- package km
- type tPersonDistanceCalculator struct {
- }
- var gMaxInt = 0x7fffffff_ffffffff
- func newPersonDistanceCalculator() IDistanceCalculator {
- return &tPersonDistanceCalculator{}
- }
- func (me *tPersonDistanceCalculator) Calc(a, b IPoint) int {
- if a == b {
- return 0
- }
- p1, ok := a.(*tPerson)
- if !ok {
- return gMaxInt
- }
- p2, ok := b.(*tPerson)
- if !ok {
- return gMaxInt
- }
- dx := p1.x - p2.x
- dy := p1.y - p2.y
- d := dx*dx + dy*dy
- if d < 0 {
- panic(d)
- }
- return d
- }
- var PersonDistanceCalculator = newPersonDistanceCalculator()
tKMeansClassifier.go
k-means聚類器, 實(shí)現(xiàn)IClassifier接口.
- package km
- import (
- "math/rand"
- "time"
- )
- type tKMeansClassifier struct {
- }
- type tPointEntry struct {
- point IPoint
- distance int
- index int
- }
- func newPointEntry(p IPoint, d int, i int) *tPointEntry {
- return &tPointEntry{
- p, d, i,
- }
- }
- func newKMeansClassifier() IClassifier {
- return &tKMeansClassifier{}
- }
- // 將samples聚類成k個(gè), 并返回k個(gè)中心點(diǎn)
- func (me *tKMeansClassifier) Classify(samples []IPoint, calc IDistanceCalculator, k int) []IPoint {
- sampleCount := len(samples)
- if sampleCount <= k {
- return samples
- }
- // 初始化, 隨機(jī)選擇k個(gè)中心點(diǎn)
- rnd := rand.New(rand.NewSource(time.Now().UnixNano()))
- centers := make([]IPoint, k)
- for selected, i:= make(map[int]bool, 0), 0;i < k; {
- n := rnd.Intn(sampleCount)
- _,ok := selected[n]
- if !ok {
- selected[n] = true
- centers[i] = samples[n]
- i++
- }
- }
- // 根據(jù)到中心點(diǎn)的距離, 劃分samples
- for {
- groups := me.split(samples, centers, calc)
- newCenters := make([]IPoint, k)
- for i,g := range groups {
- newCenters[i] = me.centerOf(g, calc)
- }
- if me.groupEquals(centers, newCenters) {
- return centers
- }
- centers = newCenters
- }
- }
- // 將樣本點(diǎn)距離中心點(diǎn)的距離進(jìn)行分簇
- func (me *tKMeansClassifier) split(samples []IPoint, centers []IPoint, calc IDistanceCalculator) [][]IPoint {
- k := len(centers)
- result := make([][]IPoint, k)
- for i := 0;i<k;i++ {
- result[i] = make([]IPoint, 0)
- }
- entries := make([]*tPointEntry, k)
- for i,c := range centers {
- entries[i] = newPointEntry(c, 0, i)
- }
- for _,p := range samples {
- for _,e := range entries {
- e.distance = calc.Calc(p, e.point)
- }
- center := me.min(entries)
- result[center.index] = append(result[center.index], p)
- }
- return result
- }
- // 計(jì)算一簇樣本的重心. 重心就是距離各點(diǎn)的總和最小的點(diǎn)
- func (me *tKMeansClassifier) centerOf(samples []IPoint, calc IDistanceCalculator) IPoint {
- entries := make([]*tPointEntry, len(samples))
- for i,src := range samples {
- distance := 0
- for _,it := range samples {
- distance += calc.Calc(src, it)
- }
- entries[i] = newPointEntry(src, distance, i)
- }
- return me.min(entries).point
- }
- // 判斷兩組點(diǎn)是否相同
- func (me *tKMeansClassifier) groupEquals(g1, g2 []IPoint) bool {
- if len(g1) != len(g2) {
- return false
- }
- for i,v := range g1 {
- if g2[i] != v {
- return false
- }
- }
- return true
- }
- // 查找距離最小的點(diǎn)
- func (me *tKMeansClassifier) min(entries []*tPointEntry) *tPointEntry {
- minI := 0
- minD := gMaxInt
- for i,it := range entries {
- if it.distance < minD {
- minI = i
- minD = it.distance
- }
- }
- return entries[minI]
- }
- var KMeansClassifier = newKMeansClassifier()