# Keep log output to warnings and above while the examples run.
logging.basicConfig(level=logging.WARNING)

# Small single-column example frame; note the repeated final value.
df = pd.DataFrame({'hello': [1, 2, 3, 4, 5, 5]})
df
hello
0 1
1 2
2 3
3 4
4 5
5 5
from random import randint
from randstr import randstr
# 1000-row benchmark frame: two random-integer columns plus six columns of
# randstr(30) values whose column names are themselves produced by randstr(5).
largeDf = pd.DataFrame({
    'hello': [randint(1, 100000) for _ in range(1000)],
    'hello2': [randint(1, 100000) for _ in range(1000)],
    # Each iteration draws the column name first, then its 1000 values.
    **{randstr(5): [randstr(30) for _ in range(1000)] for _ in range(6)},
})

getDfHash

getDfHash[source]

getDfHash(df:DataFrame, hashingAlgorithm:Callable=<lambda>)

Get a hash of a pandas DataFrame; this uses the SHA-1 algorithm. Inputs: df (pd.DataFrame): a pandas DataFrame; hashingAlgorithm (callable): a hashing function which takes bytes input. Response: the hash as a string.

%%timeit
getDfHash(largeDf)
1.92 ms ± 62.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%%timeit
import joblib
joblib.hash(largeDf)
16.5 ms ± 1.45 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
def testFeather(df):
  f:BytesIO = BytesIO()
  df.to_feather(f)
  
%timeit testFeather(largeDf)
2.04 ms ± 229 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Local cache and hash

saveLocalCache[source]

saveLocalCache(data:DataFrame, path:str='/tmp/cache', saveHash:bool=True, force:bool=True)

Save a cache of the DataFrame to a local location. data (pd.DataFrame): the DataFrame to save; path (str): path to save the cache to; saveHash (bool): whether to also save the hash digest.

saveLocalHash[source]

saveLocalHash(data:DataFrame, path='/tmp/cache.hash', force=False)

loadLocalCache[source]

loadLocalCache(path='/tmp/cache', throw=True)

loadLocalHash[source]

loadLocalHash(path='/tmp/cache.hash')

%time saveLocalCache(df,force = True)
%time saveLocalHash(df)
%time print(loadLocalHash())
%time loadLocalCache()
CPU times: user 2.43 ms, sys: 0 ns, total: 2.43 ms
Wall time: 2.9 ms
CPU times: user 519 µs, sys: 2 µs, total: 521 µs
Wall time: 502 µs
da39a3ee5e6b4b0d3255bfef95601890afd80709
CPU times: user 452 µs, sys: 0 ns, total: 452 µs
Wall time: 362 µs
CPU times: user 2.44 ms, sys: 0 ns, total: 2.44 ms
Wall time: 2.08 ms
hello
0 1
1 2
2 3
3 4
4 5
5 5

Remote cache and hash

saveRemoteHash[source]

saveRemoteHash(data:DataFrame, key='', bucket='', **kwargs)

saveRemoteCache[source]

saveRemoteCache(data:DataFrame, key='', bucket='', localCachePath='/tmp/cache', localHashPath='/tmp/hash', **kwargs)

loadRemoteCache[source]

loadRemoteCache(key='', bucket='', **kwargs)

loadRemoteHash[source]

loadRemoteHash(key='', bucket='', **kwargs)

PynamoAttributes

class PandasDataFrameAttribute[source]

PandasDataFrameAttribute(*args, **kwds) :: Attribute

An attribute of a model

class Database(Model):
    """Example model with a string hash key and a DataFrame-valued attribute."""

    class Meta:
        # Table name and region are left blank in this example.
        table_name = ''
        region = ''
        billing_mode = 'PAY_PER_REQUEST'

    # Hash key of the table; falls back to an empty string when not supplied.
    brcode = UnicodeAttribute(hash_key=True, default='')
    # The DataFrame payload, handled by PandasDataFrameAttribute.
    data = PandasDataFrameAttribute()
  
import sys
# Build a two-row frame, attach it to a Database item, and read it back
# through the ``data`` attribute.
df = pd.DataFrame({
    'cprcode': ['1234', '12345'],
    'quantity': [123, 345],
})
db = Database(brcode='1234', data=df)
db.data
cprcode quantity
0 1234 123
1 12345 345
# Location of the inventory schema file passed to getTypes / forceType below.
url = 'https://raw.githubusercontent.com/thanakijwanavit/villaMasterSchema/dev-manual/inventory/inventory.yaml'

# Sample inventory record. The numeric-looking fields are given as strings,
# and 'unknownError' is an extra key alongside the inventory fields.
inv = dict(
    iprcode='0000009',
    brcode='1000',
    ib_cf_qty='50',
    new_ib_vs_stock_cv='27',
    onlineflag=True,
    unknownError=123,
)
# Display the field-name -> Python-type mapping that getTypes returns for ``url``.
getTypes(url)
{'iprcode': int,
 'brcode': int,
 'ib_cf_qty': int,
 'new_ib_vs_stock_cv': int,
 'onlineflag': bool}

forceType[source]

forceType(url:str, df:DataFrame, defaultType=str)

# Run forceType on a one-row frame built from ``inv``. In the output below, the
# column that is absent from the getTypes mapping ('unknownError') is assigned
# the defaultType (str) and ends up with dtype ``object``.
forceType(url, pd.DataFrame([inv]))
{'iprcode': <class 'int'>, 'brcode': <class 'int'>, 'ib_cf_qty': <class 'int'>, 'new_ib_vs_stock_cv': <class 'int'>, 'onlineflag': <class 'bool'>, 'unknownError': <class 'str'>}
iprcode                int64
brcode                 int64
ib_cf_qty              int64
new_ib_vs_stock_cv     int64
onlineflag              bool
unknownError          object
dtype: object
iprcode brcode ib_cf_qty new_ib_vs_stock_cv onlineflag unknownError
0 9 1000 50 27 True 123