In [1]:
dat1 = {'a': ['1', '2', '3', '4', '5'],
        'b': [['a', 'b', 'e'], ['a', 'g'],
              ['c', 'f'], ['d'], ['b']]}
In [2]:
ans1 = {'a': [['1', '2', '5'], ['3'], ['4']],
        'b': [['a', 'b', 'e', 'g'], ['c', 'f'], ['d']]}
In [3]:
dat2 = {'a': ['1', '3', '4', '6', '9'],
        'b': [['a', 'b', 'e'], ['a', 'g', 'f'],
              ['c', 'f'], ['d', 'h'], ['b', 'g', 'h']]}
In [4]:
ans2 = {'a': [['1', '3', '4', '6', '9']],
        'b': [['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']]}
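The dat1/dat2 dicts above are test fixtures: key 'a' holds one label per row and key 'b' holds the sub-lists to consolidate. The ans1/ans2 dicts give the expected consolidations: sub-lists sharing any item are merged, repeatedly, until every remaining group is disjoint (the 'a' labels are grouped to match).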
In [5]:
def consolidate(sets):
    "Python sets orig."
    setlist = [set(s) for s in sets if s]   # Remove empty containers and convert to sets
    for i, s1 in enumerate(setlist):
        if s1:
            for s2 in setlist[i+1:]:
                intersection = s1.intersection(s2)
                if intersection:            # Merge overlapping sets into the later one
                    s2.update(s1)
                    s1.clear()
                    s1 = s2
    return [s for s in setlist if s]        # Drop the emptied-out sets
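Note that consolidate compares each set against every later one, so in the worst case the work grows quadratically with the number of groups; the timings further down show where that begins to bite.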
In [6]:
blist1 = dat1['b']
consolidate(blist1)
Out[6]:
In [7]:
blist2 = dat2['b']
consolidate(blist2)
Out[7]:
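Since set ordering is arbitrary, a quick sanity check (a minimal sketch, not a cell from the original notebook; it mirrors the same_format helper defined later) normalises the results before comparing them against ans1/ans2:

def normalise(groups):
    "Sort items within each group, then sort the groups, for comparison."
    return sorted(sorted(g) for g in groups)

assert normalise(consolidate(dat1['b'])) == normalise(ans1['b'])
assert normalise(consolidate(dat2['b'])) == normalise(ans2['b'])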
In [8]:
import numpy as np
NP_STRINGS = 'U12'   # Max size of numpy strings

def consolidate_np(lists):
    "Numpy sets"
    global NP_STRINGS
    np0 = np.array([], dtype=NP_STRINGS)
    setlist = [np.array(lst, dtype=NP_STRINGS) for lst in lists]
    for i, s1 in enumerate(setlist):
        if s1.size:
            for j, s2 in enumerate(setlist[i+1:], i+1):
                if np.intersect1d(s1, s2, assume_unique=True).size:
                    # Merge into the later array and empty the earlier one
                    s1 = setlist[j] = np.union1d(s2, s1)
                    setlist[i] = np0
    return [s.tolist() for s in setlist if s.size]
In [9]:
consolidate_np(blist1)
Out[9]:
[['c', 'f'], ['d'], ['a', 'b', 'e', 'g']]
In [10]:
consolidate_np(blist2)
Out[10]:
[['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']]
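consolidate_np returns plain lists, already sorted by np.union1d, so a hedged cross-check against the pure-Python version can reuse the normalise sketch from above:

assert normalise(consolidate_np(dat1['b'])) == normalise(consolidate(dat1['b']))
assert normalise(consolidate_np(dat2['b'])) == normalise(consolidate(dat2['b']))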
In [11]:
grouplist = [['a', 'b', 'e'],    # 0
             ['a', 'g', 'f'],    # 1
             ['c', 'f'],         # 2
             ['d', 'h'],         # 3
             ['b', 'g', 'h']]    # 4
In [12]:
root = {}
for group in grouplist:
    name = group[0]
    for item in group:
        root[item] = name
root
Out[12]:
{'a': 'a', 'b': 'b', 'e': 'a', 'g': 'b', 'f': 'c', 'c': 'c', 'd': 'd', 'h': 'b'}
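The cell above is the kernel of the dict-based method: root maps each item to a representative name (the first item of the last group that mentioned it), so a representative can be found by chasing entries until one maps to itself, much like the parent pointers of a union-find structure. A minimal illustrative sketch (find is hypothetical and not used below; it assumes the chains terminate, which consolidate_ramr ensures by re-rooting as it merges):

def find(root, item):
    "Chase representatives until an item is its own root."
    while root[item] != item:
        item = root[item]
    return item

find(root, 'h')   # 'h' -> 'b', and 'b' is its own root

Plain overwriting has lost the link between the 'a'-rooted and 'b'-rooted groups (they share item 'b'); recording and merging such indirections is exactly what consolidate_ramr adds.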
In [13]:
def consolidate_ramr(data):
    "root-and-merge-roots"
    root = {}
    grouplist = (d for d in data if d)              # Remove null sublists
    for group in grouplist:
        name = group[0]
        for item in group:
            prevroot = root.setdefault(item, name)  # Returns name if item isn't in root
            indirections = [name]
            while prevroot != name:                 # Do we need to look back through root?
                indirections.append(prevroot)
                name, prevroot = prevroot, root[prevroot]
            for ind in indirections:
                root[ind] = prevroot                # Root/re-root as necessary
    # This flattens any remaining indirections.
    for item in root.keys():
        parent = root[item]
        indirections = []
        while item != parent:
            indirections.append(item)
            item, parent = parent, root[parent]
        for ind in indirections:
            root[ind] = parent
    # Re-group on root names
    regroup = {}
    for item, name in root.items():
        regroup.setdefault(name, []).append(item)
    return list(regroup.values())
In [14]:
consolidate_ramr(blist1)
Out[14]:
[['a', 'b', 'e', 'g'], ['c', 'f'], ['d']]
In [15]:
consolidate_ramr(blist2)
Out[15]:
[['a', 'b', 'e', 'g', 'f', 'c', 'd', 'h']]
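Thanks to the re-rooting and the final flattening pass, consolidate_ramr touches each item only a near-constant number of times, which is why it is the sole method run at the largest sizes below. The same hedged fixture check as before, reusing the normalise sketch:

assert normalise(consolidate_ramr(dat1['b'])) == normalise(ans1['b'])
assert normalise(consolidate_ramr(dat2['b'])) == normalise(ans2['b'])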
In [16]:
import random
from string import ascii_lowercase
import copy

def weighted_choice(item_weights, bincount=1_000):
    '''\
    Weighted choice of item from (item, weight), ... pairs.
    Puts items in bins in proportion to their weights,
    then uses random.choice() to select items.
    A larger bincount uses more memory but gives
    higher accuracy (on average).
    '''
    bins = []
    weights = sum(iw[1] for iw in item_weights)
    bincount = min([weights, bincount])
    for item, wt in item_weights:
        bins += [item] * int(bincount * wt / weights)
    while True:
        yield random.choice(bins)

def rand_data_gen(n=5):
    "Generate n sublists of randomised data"
    # Need a pool of items to randomly select from for each group
    n_pool_items = int(2.5 * n)
    pool_strings = [f'I{x}' for x in range(n_pool_items)]
    # Groups will be of random but weighted size
    group_len_weights = ([(n, 5) for n in range(1, 6)] +     # Weights for lengths 1..5
                         [(n, 2) for n in range(6, 10)] +    # Weights for lengths 6..9
                         [(n, 1) for n in range(10, 16)])    # Weights for lengths 10..15
    group_len_weights = [(ln, wt) for ln, wt in group_len_weights
                         if ln <= n_pool_items]
    group_len_gen = weighted_choice(group_len_weights)
    data = [random.sample(pool_strings, next(group_len_gen))
            for _ in range(n)]
    # Stats
    avg = sum(len(x) for x in data) / n
    print(f"""
GENERATED DATA STATS:
    Row count: {n}
    Pool of random items: {n_pool_items}
    Average count of items per group: {avg}""")
    return data
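To see the length weighting in action, a small hedged sketch (collections.Counter is an addition, not part of the original notebook) draws a batch of group lengths; the 1..5, 6..9 and 10..15 bands should turn up in roughly 25:8:6 proportion:

from collections import Counter

length_gen = weighted_choice([(n, 5) for n in range(1, 6)] +
                             [(n, 2) for n in range(6, 10)] +
                             [(n, 1) for n in range(10, 16)])
print(sorted(Counter(next(length_gen) for _ in range(10_000)).items()))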
In [17]:
# This is just for repeatability and can be deleted
random.seed('RANDOM TEST DATA GENERATION')
In [18]:
rand_data_gen(5)
Out[18]:
In [19]:
rand_data_gen(10)
Out[19]:
In [20]:
import timeit
import datetime

def same_format(d):
    "Make comparable"
    return sorted(sorted(lst) for lst in d)

def runtime(methods, random_data):
    print("\n## \n## TIMING COMPARISONS WITH SAME RANDOM TEST DATA\n##")
    rep = 2
    results, times = [], []
    # Stash the data on the function object so the timeit setup
    # string can reach it via __main__.
    runtime.random_data = random_data
    for method in methods:
        print()
        name = method.__name__
        print(name, method.__doc__)
        runtime.result = result = []
        cmd = f"runtime.result[:] = [{name}(ddd)]"
        # The setup deepcopies the data so the timed code never
        # works on input altered by another method.
        setup = (f"from __main__ import copy, {name}, runtime\n"
                 "ddd = copy.deepcopy(runtime.random_data)")
        t = timeit.Timer(cmd, setup).timeit(number=rep) / rep
        print(datetime.timedelta(seconds=t))
        results.append(same_format(result[0]))
        times.append(t)   # seconds
    return results, times
In [21]:
from itertools import combinations

def results_compare(methods, ans):
    print("\n## \n## METHOD RESULT COMPARISON WITH SAME RANDOM TEST DATA\n##")
    for data, method in zip(ans, methods):
        name = method.__name__
        avg = sum(len(x) for x in data) / len(data)
        print(f"""\
    {name} Result stats:
        Consolidated group count: {len(data)}
        Average count of items per group: {avg}""")
    print()
    for c1, c2 in combinations(range(len(ans)), 2):
        n1, n2 = methods[c1].__name__, methods[c2].__name__
        print('    Data from', n1, '== Data from', n2, '?',
              'YES' if ans[c1] == ans[c2] else 'NO!')
In [32]:
if __name__ == '__main__':
    methods = [consolidate, consolidate_np, consolidate_ramr]
    n = 100
    data = rand_data_gen(n)
    results, times = runtime(methods, data)
    results_compare(methods, results)
In [33]:
if __name__ == '__main__':
    methods = [consolidate, consolidate_np, consolidate_ramr]
    n = 200
    data = rand_data_gen(n)
    results, times = runtime(methods, data)
    results_compare(methods, results)
In [34]:
if __name__ == '__main__':
    methods = [consolidate, consolidate_ramr]
    n = 1000
    data = rand_data_gen(n)
    results, times = runtime(methods, data)
    results_compare(methods, results)
In [35]:
if __name__ == '__main__':
    methods = [consolidate, consolidate_ramr]
    n = 10_000
    data = rand_data_gen(n)
    results, times = runtime(methods, data)
    results_compare(methods, results)
In [36]:
if __name__ == '__main__':
    methods = [consolidate, consolidate_ramr]
    n = 20_000
    data = rand_data_gen(n)
    results, times = runtime(methods, data)
    results_compare(methods, results)
In [37]:
if __name__ == '__main__':
    methods = [consolidate_ramr]
    n = 100_000
    data = rand_data_gen(n)
    results, times = runtime(methods, data)
    results_compare(methods, results)
In [38]:
if __name__ == '__main__':
    methods = [consolidate_ramr]
    n = 1_000_000
    data = rand_data_gen(n)
    results, times = runtime(methods, data)
    results_compare(methods, results)