TypeError: No matching signature found
chapmanjacobd opened this issue · comments
I tried to use this as a drop-in replacement for KMeans but I get an error:
File ~/.local/lib/python3.11/site-packages/pdc_dp_means/dpmeans.py:61, in _dpmeans_single_lloyd(X, sample_weight, centers_init, max_iter, verbose, x_squared_norms, tol, n_threads, delta, max_clusters)
59 for i in range(max_iter):
60 tic = time()
---> 61 lloyd_iter(
62 X,
63 sample_weight,
64 x_squared_norms,
65 centers,
66 centers_new,
67 weight_in_clusters,
68 labels,
69 center_shift,
70 max_index,
71 max_distance,
72 n_threads,
73 update_centers=True,
74 )
76 if verbose:
77 inertia = _inertia(X, sample_weight, centers, labels, n_threads)
File pdc_dp_means/dp_means_cython.pyx:21, in pdc_dp_means.dp_means_cython.__pyx_fused_cpdef()
TypeError: No matching signature found
> /home/xk/github/xk/lb/pdc_dp_means/dp_means_cython.pyx(21)pdc_dp_means.dp_means_cython.__pyx_fused_cpdef()
ipdb> a
ipdb> u
> /home/xk/.local/lib/python3.11/site-packages/pdc_dp_means/dpmeans.py(61)_dpmeans_single_lloyd()
59 for i in range(max_iter):
60 tic = time()
---> 61 lloyd_iter(
62 X,
63 sample_weight,
ipdb> a
X = <31446x35104 sparse matrix of type '<class 'numpy.float64'>'
with 246493 stored elements in Compressed Sparse Row format>
sample_weight = array([1., 1., 1., ..., 1., 1., 1.])
centers_init = array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
max_iter = 300
verbose = 0
x_squared_norms = array([1., 1., 1., ..., 1., 1., 1.])
tol = 2.764583521460033e-09
n_threads = 14
delta = 1.0
max_clusters = None
Here is my code:
from pdc_dp_means import DPMeans
from sklearn.feature_extraction.text import TfidfVectorizer
sentence_strings = (path_to_sentence(s) for s in paths)
joined_strings = []
for doc in nlp.pipe(sentence_strings, n_process=4):
joined_strings.append(" ".join([token.lower_ for token in doc if not token.is_stop]))
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(joined_strings)
kmeans = DPMeans(n_clusters=n_clusters or int(X.shape[0] ** 0.5), random_state=0).fit(X)
clusters = kmeans.labels_
Hi,
Could you please provide information on your OS, scikit-learn version, and numpy version?
Also, can you try with a non-sparse matrix?
Hmm, I'm not sure how to try a non-sparse matrix. I'll look into it.
cat /etc/os-release
NAME="Fedora Linux"
VERSION="38 (KDE Plasma)"
...
pip freeze | grep -iE '^sci|^num'
numpy==1.23.4
scikit-learn==1.2.2
scipy==1.9.3
$ git clone https://github.com/chapmanjacobd/library
xklb/utils.py
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
────────────────────────────────────────┐
1105: def load_spacy_model(model=None): │
────────────────────────────────────────┘
def cluster_paths(paths, model=None, n_clusters=None):
nlp = load_spacy_model(model)
- from sklearn.cluster import KMeans
+ from pdc_dp_means import DPMeans
from sklearn.feature_extraction.text import TfidfVectorizer
sentence_strings = (path_to_sentence(s) for s in paths)
─────────────────────────────────────────────────────────────┐
1117: def cluster_paths(paths, model=None, n_clusters=None): │
─────────────────────────────────────────────────────────────┘
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(joined_strings)
- kmeans = KMeans(n_clusters=n_clusters or int(X.shape[0] ** 0.5), random_state=0).fit(X)
+ kmeans = DPMeans(n_clusters=n_clusters or int(X.shape[0] ** 0.5), random_state=0).fit(X)
clusters = kmeans.labels_
$ ipython --pdb -m xklb.lb cs ~/mc/tabs
Thanks, env shouldn't be an issue then.
Could you try the toy example:
from sklearn.datasets import make_blobs
from pdc_dp_means import DPMeans
# Generate sample data
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
# Apply DPMeans clustering
dpmeans = DPMeans(n_clusters=1,n_init=10, delta=10) # n_init and delta parameters
dpmeans.fit(X)
# Predict the cluster for each data point
y_dpmeans = dpmeans.predict(X)
# Plotting clusters and centroids
import matplotlib.pyplot as plt
plt.scatter(X[:, 0], X[:, 1], c=y_dpmeans, s=50, cmap='viridis')
centers = dpmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
plt.show()
This will use a non-sparse matrix, and I suspect that this is the problem.