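@EMS is right that the best approach depends on what you want to preserve, how the data is structured, and what the result will be used for.
In this case (and the approach from EMS works well!), if the goal is mainly plotting, a simple grid-based alternative is enough.
Divide the extent of the data into a regular grid, bin the points by cell, and keep one point per occupied cell. Dense regions are thinned heavily, while isolated outliers survive.
Using pandas for the "grouping" step is not strictly necessary, but it is shorter, easier to read, and less error-prone than doing it in "pure" numpy.
As a quick example with normally distributed points, binned on a regular grid: one random point is kept per bin (the subset is shown on the right).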
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def grid_subsample(x, y, subset_num, rng=None):
    """Thin a 2-D point cloud by keeping one random point per grid cell.

    The bounding box of the data is split into ``nbins x nbins`` equal
    cells, where ``nbins = int(sqrt(subset_num))``, and one point is drawn
    uniformly at random from every non-empty cell.

    Parameters
    ----------
    x, y : array-like
        1-D coordinate arrays of equal, non-zero length.
    subset_num : int
        Upper bound on the number of points to keep (must be >= 1).
    rng : object with a ``permutation`` method, optional
        Source of randomness, e.g. a ``np.random.RandomState``. Defaults
        to the global ``np.random`` state.

    Returns
    -------
    pandas.DataFrame
        Columns ``x`` and ``y`` holding the selected points, indexed by
        their ``(row, col)`` grid cell. It has at most ``nbins ** 2`` rows.

    Raises
    ------
    ValueError
        If the inputs are empty, not 1-D, of different lengths, or if
        ``subset_num`` is smaller than 1.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.ndim != 1 or x.shape != y.shape:
        raise ValueError('x and y must be 1-D arrays of equal length')
    if x.size == 0:
        raise ValueError('x and y must not be empty')
    if subset_num < 1:
        raise ValueError('subset_num must be at least 1')
    if rng is None:
        rng = np.random

    nbins = int(np.sqrt(subset_num))
    xbins = np.linspace(x.min(), x.max(), nbins + 1)
    ybins = np.linspace(y.min(), y.max(), nbins + 1)
    # np.digitize puts values equal to the last edge in an extra bin of
    # their own; shift to 0-based cells and clip so the maxima land in the
    # last real cell and the grid stays nbins x nbins.
    col = np.clip(np.digitize(x, xbins) - 1, 0, nbins - 1)
    row = np.clip(np.digitize(y, ybins) - 1, 0, nbins - 1)

    df = pd.DataFrame({'x': x, 'y': y, 'row': row, 'col': col})
    # Shuffle whole rows, then take the first row of each cell. This keeps
    # each (x, y) pair intact; aggregating the columns separately would
    # mix the x of one point with the y of another.
    shuffled = df.iloc[rng.permutation(len(df))]
    picked = shuffled.groupby(['row', 'col']).head(1)
    return picked.set_index(['row', 'col']).sort_index()


def plot_grid_subsample(total_num=100000, subset_num=1000):
    """Plot a random normal point cloud next to its grid-based subset.

    Parameters
    ----------
    total_num : int
        Number of random points to generate.
    subset_num : int
        Upper bound on the subset size, passed to ``grid_subsample``.
    """
    x, y = np.random.normal(0, 1, (2, total_num))
    new = grid_subsample(x, y, subset_num)

    fig, axes = plt.subplots(ncols=2, sharex=True, sharey=True)
    axes[0].plot(x, y, 'k.')
    axes[0].set_title('Original $(n={})$'.format(total_num))
    axes[1].plot(new.x, new.y, 'k.')
    axes[1].set_title('Subset $(n={})$'.format(len(new)))
    # 'box-forced' was removed from Matplotlib; 'box' now also works for
    # axes that share limits.
    plt.setp(axes, aspect=1, adjustable='box')
    fig.tight_layout()
    plt.show()


if __name__ == '__main__':
    plot_grid_subsample()

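@EMS also suggested weighting the points by density, which leads to another approach.
Estimate the density at every point, take its inverse, normalise it so that it sums to one, and use it as the probability of selecting each point.
scipy.stats.gaussian_kde is not optimised for this use case (or for large numbers of points in general). It is the bottleneck here. A faster version could be written for this specific case (approximations, exploiting the pairwise-distance structure, etc.). That is beyond the scope of this answer. Just be aware that, for this example with 1e5 points, it takes a while to run.
Because the weights are the inverse of the estimated density, points in sparse regions are much more likely to be chosen. As a result, outliers are kept, the dense core is thinned out, and the overall extent of the data stays visible.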
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
def density_subsample(x, y, subset_num, rng=None):
    """Pick points with probability inversely proportional to their density.

    The density at every point is estimated with ``gaussian_kde``. Its
    inverse, normalised to sum to one, is used as the selection
    probability, so points in sparse regions are favoured.

    Parameters
    ----------
    x, y : array-like
        1-D coordinate arrays of equal length.
    subset_num : int
        Number of distinct points to draw; must not exceed ``len(x)``.
    rng : object with a ``choice`` method, optional
        Source of randomness, e.g. a ``np.random.RandomState``. Defaults
        to the global ``np.random`` state.

    Returns
    -------
    idx : numpy.ndarray
        Indices into ``x`` / ``y`` of the selected points (no repeats).
    dens : numpy.ndarray
        Estimated density at every input point.

    Raises
    ------
    ValueError
        Propagated from ``choice`` when ``subset_num`` is larger than the
        number of points. Errors raised by ``gaussian_kde`` for inputs it
        cannot handle propagate unchanged.
    """
    if rng is None:
        rng = np.random

    xy = np.vstack([x, y])
    # NOTE: the KDE is evaluated at every input point, so the cost grows
    # quadratically with the number of points.
    dens = gaussian_kde(xy)(xy)
    weight = 1 / dens
    weight /= weight.sum()

    # Sample indices without replacement so that the subset contains
    # subset_num distinct points; sampling with replacement tends to pick
    # the highest-weight outliers several times.
    idx = rng.choice(xy.shape[1], size=subset_num, replace=False, p=weight)
    return idx, dens


def plot_density_subsample(total_num=100000, subset_num=1000):
    """Plot a random normal point cloud next to its density-weighted subset.

    Parameters
    ----------
    total_num : int
        Number of random points to generate.
    subset_num : int
        Number of points to draw, passed to ``density_subsample``.
    """
    x, y = np.random.normal(0, 1, (2, total_num))
    idx, dens = density_subsample(x, y, subset_num)

    fig, axes = plt.subplots(ncols=2, sharex=True, sharey=True)
    # 'none' is the documented way to disable marker edges; an empty
    # string is rejected by current Matplotlib releases.
    axes[0].scatter(x, y, c=dens, edgecolor='none')
    axes[0].set_title('Original $(n={})$'.format(total_num))
    axes[1].plot(x[idx], y[idx], 'k.')
    axes[1].set_title('Subset $(n={})$'.format(len(idx)))
    # 'box-forced' was removed from Matplotlib; 'box' now also works for
    # axes that share limits.
    plt.setp(axes, aspect=1, adjustable='box')
    fig.tight_layout()
    plt.show()


if __name__ == '__main__':
    plot_density_subsample()
