K-means เป็นอัลกอริทึมการจัดกลุ่ม (clustering) แบบไม่มีผู้สอน (unsupervised) มาตรฐาน ซึ่งเมื่อกำหนดชุดของ "จุด" และจำนวนกลุ่ม K มาให้ จะกำหนดแต่ละ "จุด" ให้อยู่ในกลุ่มใดกลุ่มหนึ่งจาก K กลุ่ม

Pseudo-Code ของ K-mean

โปรดทราบว่า K-means มีหลากหลายรูปแบบ คุณต้องใช้อัลกอริทึมที่ฉันอธิบายด้านล่าง คุณอาจดัดแปลงอัลกอริทึมหรือใช้บิวด์อินได้ ตราบเท่าที่เมื่อกำหนดจุดเริ่มต้นเดียวกัน คุณจะได้ผลลัพธ์เหมือนกับอัลกอริทึมนี้

ในการท้าทายนี้อินพุตทั้งหมดจะเป็นจุดบนระนาบ 2D (แต่ละจุดจะถูกแทนด้วยพิกัดใน x และ y)

Inputs: K, the number of clusters
        P, the set of points

Choose K points of P uniformly at random
Each chosen point is the initial centroid of its cluster

Loop:
     For each point in P:
         Assign to the cluster whose centroid is the nearest (Euclidean distance)
         In case of a tie, any of the tied cluster can be chosen

     Recompute the centroid of each cluster:
         Its x coordinate is the average of all x's of the points in the cluster
         Its y coordinate is the average of all y's of the points in the cluster

Until the clusters don't change from one iteration to the next

Output: the set of clusters

อินพุตและเอาต์พุต

คุณอาจใช้ K และ P ผ่านSTDINหรือเป็นอาร์กิวเมนต์ของฟังก์ชัน ฯลฯ
P และจุดใน P สามารถแสดงโดยใช้โครงสร้างใด ๆ ที่เป็นธรรมชาติสำหรับชุด / รายการในภาษาที่คุณเลือก
K เป็นจำนวนเต็มบวกอย่างเคร่งครัด
คุณอาจจะสมมติว่าอินพุตนั้นถูกต้อง
P จะมีจุดอย่างน้อย K จุดเสมอ
คุณสามารถส่งออกกลุ่มไปยังSTDOUTส่งกลับจากฟังก์ชั่น ฯลฯ
การเรียงลำดับของคลัสเตอร์และการเรียงลำดับภายในคลัสเตอร์นั้นไม่สำคัญ - คุณสามารถส่งคืนกลุ่มของจุดเพื่อเป็นตัวแทนของกลุ่มหรือแต่ละจุดที่มีป้ายระบุสำหรับกลุ่ม (เช่นจำนวนเต็ม)

กรณีทดสอบ

เนื่องจากกลุ่มผลลัพธ์นั้นขึ้นอยู่กับจุดที่เลือกไว้ในตอนแรกคุณอาจไม่ได้รับผลลัพธ์เดียวกันทั้งหมด (หรือผลลัพธ์เดียวกันทุกครั้งที่คุณเรียกใช้รหัสของคุณ)

ดังนั้นให้ใช้เอาต์พุตเป็นตัวอย่างเอาต์พุตเท่านั้น

Input:
  K = 1
  P = [[1,2.5]]
Output:
  [[[1,2.5]]]

Input:
  K = 3
  P = [[4,8], [15,16], [23,42], [-13.37,-12.1], [666,-666]]
Output:
  [[[666,-666]],[[-13.37,-12.1],[4,8]],[[15,16],[23,42]]]

Input:
  K = 2
  P = [[1,1], [1,1], [1,1]]
Output:
  [[[1,1]],[[1,1],[1,1]]]

เกณฑ์การให้คะแนน

นี่คือรหัสกอล์ฟดังนั้นคำตอบที่สั้นที่สุดในหน่วยไบต์ชนะ

code-golf random classification

— Fatalize
แหล่งที่มา

มีการอนุญาตให้ใช้บิวด์อินเมื่อผลลัพธ์ไม่สามารถแยกออกจากอัลกอริทึมของคุณได้หรือไม่

— Martin Ender

@MartinBüttner หากคุณพิสูจน์ได้ว่าเมื่อกำหนดจุดเริ่มต้นเดียวกันแล้ว มันจะลู่เข้าสู่ผลลัพธ์เดียวกัน ก็ใช้ได้

— Fatalize

ยอมรับได้ไหมถ้าเอาต์พุตเป็นเลเบลระบุคลัสเตอร์ของแต่ละจุด? (เช่น ทุกจุดของคลัสเตอร์แรกมีเลเบล 1 ทุกจุดของคลัสเตอร์ที่สองมีเลเบล 2 เป็นต้น)

— flawr

@ flawr ใช่มันเป็นที่ยอมรับ

— Fatalize

K=2, P = [[1,1], [1,1], [1,1]] เป็นกรณีทดสอบแบบลดรูป (degenerate):

— Peter Taylor

4

Matlab ขนาด 25 ไบต์

@(x,k)kmeans(x,k,'S','u')

รับn x 2เมทริกซ์ (หนึ่งแถวต่อจุดเช่น[[4,8]; [15,16]; [23,42]; [-13.37,-12.1]; [666,-666]]) ฟังก์ชั่นนี้จะส่งคืนรายการฉลากสำหรับแต่ละจุดอินพุต

— flawr
แหล่งที่มา

5

C ++, 479 474 ไบต์

เพียง 20x เท่า Matlab!

แข็งแรงเล่นกอล์ฟ

// Golfed K-means over 2-D points. Expects <vector>/<algorithm> to be included and
// `using namespace std;` to be in effect (vector, sort, random_shuffle are unqualified).
// P: point with coordinates x,y; i holds a cluster label on points and a running
//    point count on centroid accumulators (see a() and n()).
// l: comparator ordering two points by squared distance to the global z.
// m(k,p): shuffles p, seeds k centroids from its first k points, then writes a
//    cluster index into every p[i].i.
// NOTE(review): in n(), `?:` binds tighter than `,`, so the return expression parses as
//    (i ? ... : x=p.x), y=p.y, 0 - confirm the loop condition t behaves as intended.
// NOTE(review): a() and m() are declared float but contain no return statement.
#define V vector<P>
#define f float
struct P{f x,y,i=0;f d(P&p){return(p.x-x)*(p.x-x)+(p.y-y)*(p.y-y);}f n(P&p){return i?x/=i,y/=i,d(p):x=p.x,y=p.y,0;}f a(P&p){x+=p.x,y+=p.y,i++;}};P z;int l(P a,P b){return a.d(z)<b.d(z);}f m(f k,V&p){f s=p.size(),i,j=0,t=1;V c(k),n=c,d;for(random_shuffle(p.begin(),p.end());j<k;c[j].i=j++)c[j]=p[j];for(;t;c=n,n=V(k)){for(i=0;i<s;i++)d=c,z=p[i],sort(d.begin(),d.end(),l),j=d[0].i,p[i].i=j,n[j].a(p[i]);for(j=t=0;j<k;j++)t+=n[j].n(c[j]);}}

อินพุตของอัลกอริทึมคือชุดของจุด (struct P) ที่มี x และ y และเอาต์พุตคือชุดเดียวกัน โดย i ของแต่ละจุดถูกตั้งค่าให้ระบุดัชนีของคลัสเตอร์ที่จุดนั้นไปอยู่

ฟิลด์ i ที่เพิ่มมานี้ยังใช้ระบุคลัสเตอร์ด้วย ในลูปหลัก เซนทรอยด์ที่ใกล้แต่ละจุดที่สุดจะหาได้โดยการเรียงลำดับสำเนาของเซนทรอยด์ปัจจุบันตามความใกล้กับจุดนั้น

วิธีนี้จัดการกับกรณีลดรูป (degenerate เช่น คลัสเตอร์ว่าง) โดยคงตำแหน่งก่อนหน้าของเซนทรอยด์ที่เกี่ยวข้องไว้ (ดูนิยามของ P::n ซึ่งคืนค่าระยะทางไปยังเซนทรอยด์ก่อนหน้าด้วย) สามารถประหยัดได้อีกสองสามตัวอักษรหากสมมติว่ากรณีเหล่านี้จะไม่เกิดขึ้น

Ungolfed ด้วยหลัก

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <random>
#include <vector>
using namespace std;

// Short aliases inherited from the golfed version; main() relies on both.
#define V std::vector<P>
#define f float

/// A 2-D point.
/// `i` is dual-purpose: on input/output points it carries the cluster label,
/// on centroid accumulators it counts how many points were added via a().
struct P{
    f x=0,y=0,i=0;

    /// Squared Euclidean distance to `p`. Squaring preserves ordering, so the
    /// square root is never needed for nearest-centroid search.
    f d(const P&p)const{
        const f dx=p.x-x,dy=p.y-y;
        return dx*dx+dy*dy;
    }

    /// Normalize-or-reset. If points were accumulated, turns the coordinate
    /// sums into their mean; otherwise (empty cluster) copies the previous
    /// centroid `p`. Returns the squared distance between the result and `p`.
    f n(const P&p){
        if(i){
            x/=i;
            y/=i;
            return d(p);
        }
        x=p.x;
        y=p.y;
        return 0;
    }

    /// Adds `p`'s coordinates to the running sums and bumps the counter.
    /// Returns the updated count.
    f a(const P&p){
        x+=p.x;
        y+=p.y;
        return ++i;
    }
};

P z;                                                 // reference point read by l()
/// Closer-to-z comparator: true when `a` is strictly nearer to `z` than `b`.
int l(const P&a,const P&b){return a.d(z)<b.d(z);}

/// K-means clustering (Lloyd's algorithm) on `p` with `k` clusters.
///
/// Shuffles `p`, uses its first `k` points as initial centroids, then repeats
/// assign-to-nearest / recompute-means until no centroid moves. On return each
/// p[q].i holds the zero-based index of the cluster that point belongs to.
/// An empty cluster keeps its previous centroid.
///
/// @param k number of clusters; must satisfy 1 <= k <= p.size().
/// @param p points to cluster; reordered in place and labelled through `i`.
/// @return number of passes performed, or 0 if `k` is out of range (in which
///         case `p` is left untouched).
///
/// Not reentrant: the nearest-centroid search goes through the global `z`.
/// The shuffle is seeded from rand(), so a caller's srand() still controls it.
f m(f k,V&p){
    const std::size_t total=p.size();
    // Written so that NaN and negative k are rejected before the cast below.
    if(!(k>=1)||k>static_cast<f>(total))return 0;
    const std::size_t clusters=static_cast<std::size_t>(k);
    if(clusters>total)return 0;

    std::mt19937 rng(static_cast<unsigned>(std::rand()));
    std::shuffle(p.begin(),p.end(),rng);              // initial random assignment
    V centroids(p.begin(),p.begin()+static_cast<std::ptrdiff_t>(clusters));

    f rounds=0;
    f moved=1;
    // `moved>0` (rather than `moved!=0`) also terminates if a NaN sneaks in.
    while(moved>0){
        V sums(clusters);                             // zeroed accumulators
        for(std::size_t q=0;q<total;q++){             // assign to clusters
            z=p[q];
            // The label is the centroid's position, never its `i` field,
            // because `i` is reused as a point counter on accumulators.
            const std::size_t best=static_cast<std::size_t>(
                std::min_element(centroids.begin(),centroids.end(),l)-centroids.begin());
            p[q].i=static_cast<f>(best);
            sums[best].a(p[q]);                       // and add those coords
        }
        moved=0;
        for(std::size_t j=0;j<clusters;j++)           // normalize & measure movement
            moved+=sums[j].n(centroids[j]);
        centroids.swap(sums);
        rounds++;
    }
    return rounds;
}

/// Command-line driver: `prog K x1 y1 [x2 y2 ...]`.
/// Echoes the parsed points, clusters them with m(), then prints each cluster
/// index followed by its member points (with their position in the shuffled list).
/// Returns 0 on success and 1 when the arguments are missing or malformed.
int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s K x1 y1 [x2 y2 ...]\n", argc > 0 ? argv[0] : "kmeans");
        return 1;
    }

    int k = 0;
    if (sscanf(argv[1], "%d", &k) != 1 || k < 1) {
        fprintf(stderr, "K must be a strictly positive integer, got '%s'\n", argv[1]);
        return 1;
    }
    // Coordinates come in x/y pairs; a dangling x would read past argv.
    if ((argc - 2) % 2 != 0) {
        fprintf(stderr, "every point needs both an x and a y coordinate\n");
        return 1;
    }

    srand((unsigned)time(0));

    V p;
    printf("Input:\n");
    for (int i=2,j=0; i+1<argc; i+=2, j++) {
        P n;
        if (sscanf(argv[i], "%f", &(n.x)) != 1 || sscanf(argv[i+1], "%f", &(n.y)) != 1) {
            fprintf(stderr, "cannot parse point %d: '%s' '%s'\n", j, argv[i], argv[i+1]);
            return 1;
        }
        p.push_back(n);
        printf("%d : %f,%f\n", j, p[j].x, p[j].y);
    }
    // The initial centroids are K of the points, so at least K are required.
    if (p.size() < (std::size_t)k) {
        fprintf(stderr, "need at least K=%d points, got %u\n", k, (unsigned)p.size());
        return 1;
    }

    m(k,p);
    printf("Clusters:\n");
    for (int q=0; q<k; q++) {
        printf("%d\n", q);
        for (unsigned int i=0; i<p.size(); i++) {
            // i is unsigned, so it is printed with %u.
            if (p[i].i == q) printf("\t%f,%f (%u)\n", p[i].x, p[i].y, i);
        }
    }
    return 0;
}

— tucuxi
แหล่งที่มา

ฉันรู้ว่าฉันอาจมาสายในความคิดเห็นนี้ แต่คุณสามารถกำหนดแมโคร#define R p){returnและเปลี่ยนอาร์กิวเมนต์ที่สองของlเป็นpเพื่อให้คุณสามารถใช้งานได้ทั้งหมดสามครั้ง

— Zacharý

4

J, 60 54 ไบต์

p=:[:(i.<./)"1([:+/&.:*:-)"1/
]p](p(+/%#)/.[)^:_(?#){]

กำหนดกริยาตัวช่วย p ซึ่งรับรายการของจุดและเซนทรอยด์ แล้วจำแนกแต่ละจุดด้วยดัชนีของเซนทรอยด์ที่ใกล้ที่สุด จากนั้นใช้กริยานี้ทำซ้ำกระบวนการเลือกเซนทรอยด์ใหม่โดยหาค่าเฉลี่ยของจุดในแต่ละคลัสเตอร์จนกว่าจะลู่เข้า

การใช้

ค่าของ k กำหนดเป็นจำนวนเต็มทาง LHS ส่วนรายการของจุดกำหนดเป็นอาร์เรย์ 2 มิติทาง RHS ในตัวอย่างนี้ระบุเป็นรายการของค่าที่ถูกปรับรูปเป็นอาร์เรย์ 2 มิติขนาด 5 x 2 เอาต์พุตคือเลเบลของคลัสเตอร์ที่แต่ละจุดสังกัด เรียงลำดับเดียวกับอินพุต

หากต้องการใช้ seed คงที่เพื่อให้ได้ผลลัพธ์ที่ทำซ้ำได้ ให้แทนที่ ? ด้วย ?. ใน (?#)

   p =: [:(i.<./)"1([:+/&.:*:-)"1/
   f =: ]p](p(+/%#)/.[)^:_(?#){]
   3 f (5 2 $ 4 8 15 16 23 42 _13.37 _12.1 666 _666)
0 1 1 0 2

คำอธิบาย

[:(i.<./)"1([:+/&.:*:-)"1/  Input: points on LHS, centroids on RHS
           (          )"1/  Form a table between each point and centroid and for each
                     -        Find the difference elementwise
            [:     *:         Square each
              +/&.:           Reduce using addition
                              Apply the inverse of square (square root) to that sum
[:(     )"1                 For each row of that table
     <./                      Reduce using min
   i.                         Find the index of the minimum in that row
                            Returns a list of indices for each point that shows
                            which centroid it belongs to

]p](p(+/%#)/.[)^:_(?#){]  Input: k on LHS, points on RHS
                    #     Count the number of points
                   ?      Choose k values in the range [0, len(points))
                          without repetition
                       ]  Identity function, get points
                      {   Select the points at the indices above
  ]                       Identity function, get points
   (         )^:_         Repeat until convergence
    p                       Get the labels for each point
             [              Identity function, get points
           /.               Partition the points using the labels and for each
      +/                      Take the sums of points elementwise
         #                    Get the number of points
        %                     Divide sum elementwise by the count
                            Return the new values as the next centroids
]                         Identity function, get points
 p                        Get the labels for each point and return it

— ไมล์
แหล่งที่มา

ฉันจะให้ +1 แต่ฉันกลัวว่าการทำลาย 3k ของคุณจะสาปแช่งฉัน

— NoOne อยู่ที่นี่

3

CJam (60 ไบต์)

{:Pmr<1/2P,#{:z{_:+\,/}f%:C,{P{C\f{.-Yf#:+}_:e<#1$=},\;}%}*}

นี่คือฟังก์ชันที่รับอินพุตในรูปแบบ k p บนสแต็ก โดยสมมติว่าพิกัดของจุดเป็นชนิด double ไม่ใช่ int ฟังก์ชันไม่ได้สมมติอะไรเกี่ยวกับจำนวนมิติของจุด ดังนั้นจึงจัดกลุ่มในปริภูมิยูคลิด 6 มิติได้ดีเท่ากับใน 2 มิติที่โจทย์ระบุ

การสาธิตออนไลน์

— ปีเตอร์เทย์เลอร์
แหล่งที่มา

2

Mathematica 14 12 ไบต์

เนื่องจากอนุญาตให้มีบิวด์อินจึงควรทำเช่นนี้

FindClusters

ตัวอย่าง

FindClusters[{{4, 8}, {15, 16}, {23, 42}, {-13.37, -12.1}, {666, -666}}, 3]

{{{4, 8}, {-13.37, -12.1}}, {{15, 16}, {23, 42}}, {{666, -666}}}

— DavidC
แหล่งที่มา

คุณไม่จำเป็นต้องใช้วงเล็บ f = FindClusters, f[something].

— NoOne อยู่ที่นี่

โอเคขอบคุณฉันไม่แน่ใจ

— DavidC

1

เยลลี่ 24 ไบต์

_ÆḊ¥þ³i"Ṃ€$
ẊḣµÇÆmƙ³µÐLÇ

ลองออนไลน์!

ใช้คุณสมบัติที่นำมาใช้หลังจากการโพสต์ความท้าทายนี้ เฉกเช่นนี้ไม่มีที่ไม่ใช่การแข่งขัน

คำอธิบาย

_ÆḊ¥þ³i"Ṃ€$  Helper link. Input: array of points
             (Classify) Given a size-k array of points, classifies
             each point in A to the closet point in the size-k array
    þ        Outer product with
     ³       All points, P
   ¥         Dyadic chain
_              Subtract
 ÆḊ            Norm
          $  Monadic chain
      i"     Find first index, vectorized
        Ṃ€   Minimum each

ẊḣµÇÆmƙ³µÐLÇ  Main link. Input: array of points P, integer k
  µ           Start new monadic chain
Ẋ               Shuffle P
 ḣ              Take the first k
        µ     Start new monadic chain
   Ç            Call helper (Classify)
      ƙ         Group with those values the items of
       ³        All points, P
    Æm            Take the mean of each group
         ÐL   Repeat that until the results converge
           Ç  Call helper (Classify)

— ไมล์
แหล่งที่มา

1

R , 273 ไบต์

# K-means: K = number of clusters, P = matrix with one point per row.
# C defaults to K rows of P drawn with sample(), i.e. the initial centroids.
function(K,P,C=P[sample(nrow(P),K),]){while(T){D=C
# U[i]: index of the centroid picked for point i. The first K entries of
# dist(rbind(P[i,],C)) are the distances from P[i,] to each row of C.
U=sapply(1:nrow(P),function(i)w(dist(rbind(P[i,],C))[1:K]))
# New centroids: column means of the points carrying each label.
C=t(sapply(1:K,function(i)colMeans(P[U==i,,drop=F])))
# NOTE(review): T becomes TRUE when C equals the previous centroids D, and the
# loop keeps running while T is TRUE - confirm the stop condition is not inverted.
T=isTRUE(all.equal(C,D))}
# Result: P with the label vector U prepended as its first column.
cbind(U,P)}
# w: position of the minimum of x; when several positions tie, one is drawn at random.
w=function(x,y=seq_along(x)[x==min(x)])"if"(length(y)>1,sample(y,1),y)

ลองออนไลน์!

ใช้Pเป็นเมทริกซ์โดยมีxและyพิกัดในคอลัมน์แรกและคอลัมน์ที่สองตามลำดับ ส่งคืนPพร้อมคอลัมน์แรกที่เพิ่มซึ่งระบุดัชนีคลัสเตอร์ (จำนวนเต็ม)

ฉันต้องนิยาม w ขึ้นเองโดยคัดลอกซอร์สมาจาก nnet::which.is.max เพื่อให้สอดคล้องกับข้อกำหนดที่ว่าคลัสเตอร์จะถูกเลือกแบบสุ่มในกรณีที่ระยะทางเสมอกัน (tie) มิฉะนั้นฉันจะใช้ which.min จาก base ซึ่งรวมแล้วได้ 210 ไบต์ ยังมีช่องให้กอล์ฟได้อีก แต่ฉันไม่อยากทำให้โค้ดอ่านยากเกินไป เพื่อให้ผู้อื่นมีโอกาสมองเห็นปัญหาที่อาจมีในโค้ดของฉัน

— Jayce
แหล่งที่มา

0

Julia 213 ไบต์

# K-means over the rows of p with k clusters; returns the label array `a`.
function f(p,k)
# A: assignment from the previous pass (0 = none yet).
A=0
# P: number of rows of p.
P=size(p,1)
# Initial centroids: the rows of p at the first k indices of a random permutation.
c=p[randperm(P)[1:k],:]
while(true)
# NOTE(review): c[i] and p[j] use a single index on 2-D data - confirm this
# compares whole points rather than single coordinates.
d=[norm(c[i]-p[j]) for i in 1:k, j in 1:P]
# For each column of d (one per point), the index of its smallest entry.
a=mapslices(indmin,d,1)
# Stop as soon as the assignment repeats.
a==A&&return a
A=a
# New centroids: mean over the rows of p selected by each label.
c=[mean(p[vec(a.==i),:],1) for i in 1:k]
end
end

ส่งคืนอาร์เรย์ที่มีความยาวเท่ากับ p โดยแต่ละสมาชิกเป็นจำนวนเต็มที่ระบุว่าสมาชิกที่ตรงกันของ p อยู่ในคลัสเตอร์ใด

ฉันคิดว่ายังคงมีขอบเขตที่ค่อนข้างยุติธรรมในการปรับตัวละครให้เหมาะสม

(แน่นอนฉันสามารถใช้แพ็คเกจ Clustering.jl เพื่อทำสิ่งเล็กน้อย)

— Lyndon White
แหล่งที่มา

กอล์ฟอัลกอริทึม K- หมายถึง

Pseudo-Code ของ K-mean

อินพุตและเอาต์พุต

กรณีทดสอบ

เกณฑ์การให้คะแนน

Matlab ขนาด 25 ไบต์

C ++, 479 474 ไบต์

J, 60 54 ไบต์

การใช้

คำอธิบาย

CJam (60 ไบต์)

Mathematica 14 12 ไบต์

ตัวอย่าง

เยลลี่ 24 ไบต์

คำอธิบาย

R , 273 ไบต์

Julia 213 ไบต์