这篇文章主要讲解了“怎么使用Python读取Hive数据库”,文中的讲解内容简单清晰,易于学习与理解,下面请大家跟着小编的思路慢慢深入,一起来研究和学习“怎么使用Python读取Hive数据库”吧!
实际业务读取hive数据库的代码
import logging
import pandas as pd
from impala.dbapi import connect
import sqlalchemy
from sqlalchemy.orm import sessionmaker
import os
import time
import os
import datetime
from dateutil.relativedelta import relativedelta
from typing import Dict, List
import logging
import threading
import pandas as pd
import pickle
class HiveHelper(object):
def __init__(
self,
host='10.2.32.22',
port=21051,
database='ur_ai_dw',
auth_mechanism='LDAP',
user='urbi',
password='Ur#730xd',
logger:logging.Logger=None
):
self.host = host
self.port = port
self.database = database
self.auth_mechanism = auth_mechanism
self.user = user
self.password = password
self.logger = logger
self.impala_conn = None
self.conn = None
self.cursor = None
self.engine = None
self.session = None
def create_table_code(self, file_name):
'''创建表类代码'''
os.system(f'sqlacodegen {self.connection_str} > {file_name}')
return self.conn
def get_conn(self):
'''创建连接或获取连接'''
if self.conn is None:
engine = self.get_engine()
self.conn = engine.connect()
return self.conn
def get_impala_conn(self):
'''创建连接或获取连接'''
if self.impala_conn is None:
self.impala_conn = connect(
host=self.host,
port=self.port,
database=self.database,
auth_mechanism=self.auth_mechanism,
user=self.user,
password=self.password
)
return self.impala_conn
def get_engine(self):
'''创建连接或获取连接'''
if self.engine is None:
self.engine = sqlalchemy.create_engine('impala://', creator=self.get_impala_conn)
return self.engine
def get_cursor(self):
'''创建连接或获取连接'''
if self.cursor is None:
self.cursor = self.conn.cursor()
return self.cursor
def get_session(self) -> sessionmaker:
'''创建连接或获取连接'''
if self.session is None:
engine = self.get_engine()
Session = sessionmaker(bind=engine)
self.session = Session()
return self.session
def close_conn(self):
'''关闭连接'''
if self.conn is not None:
self.conn.close()
self.conn = None
self.dispose_engine()
self.close_impala_conn()
def close_impala_conn(self):
'''关闭impala连接'''
if self.impala_conn is not None:
self.impala_conn.close()
self.impala_conn = None
def close_session(self):
'''关闭连接'''
if self.session is not None:
self.session.close()
self.session = None
self.dispose_engine()
def dispose_engine(self):
'''释放engine'''
if self.engine is not None:
# self.engine.dispose(close=False)
self.engine.dispose()
self.engine = None
def close_cursor(self):
'''关闭cursor'''
if self.cursor is not None:
self.cursor.close()
self.cursor = None
def get_data(self, sql, auto_close=True) -> pd.DataFrame:
'''查询数据'''
conn = self.get_conn()
data = None
try:
# 异常重试3次
for i in range(3):
try:
data = pd.read_sql(sql, conn)
break
except Exception as ex:
if i == 2:
raise ex # 往外抛出异常
time.sleep(60) # 一分钟后重试
except Exception as ex:
self.logger.exception(ex)
raise ex # 往外抛出异常
finally:
if auto_close:
self.close_conn()
return data
pass
class VarsHelper():
def __init__(self, save_dir, auto_save=True):
self.save_dir = save_dir
self.auto_save = auto_save
self.values = {}
if not os.path.exists(os.path.dirname(self.save_dir)):
os.makedirs(os.path.dirname(self.save_dir))
if os.path.exists(self.save_dir):
with open(self.save_dir, 'rb') as f:
self.values = pickle.load(f)
f.close()
def set_value(self, key, value):
self.values[key] = value
if self.auto_save:
self.save_file()
def get_value(self, key):
return self.values[key]
def has_key(self, key):
return key in self.values.keys()
def save_file(self):
with open(self.save_dir, 'wb') as f:
pickle.dump(self.values, f)
f.close()
pass
class GlobalShareArgs():
args = {
"debug": False
}
def get_args():
return GlobalShareArgs.args
def set_args(args):
GlobalShareArgs.args = args
def set_args_value(key, value):
GlobalShareArgs.args[key] = value
def get_args_value(key, default_value=None):
return GlobalShareArgs.args.get(key, default_value)
def contain_key(key):
return key in GlobalShareArgs.args.keys()
def update(args):
GlobalShareArgs.args.update(args)
pass
class ShareArgs():
args = {
"labels_dir":"./hjx/shop_group/month_w_amt/data/labels", # 标签目录
"labels_output_dir":"./hjx/shop_group/month_w_amt/data/labels_output", # 聚类导出标签目录
"common_datas_dir":"./hjx/data", # 共用数据目录。ur_bi_dw的公共
"only_predict": False, # 只识别,不训练
"delete_model": True, # 先删除模型,仅在训练时使用
"export_excel": False, # 导出excel
"classes": 12, # 聚类数
"batch_size": 16,
"hidden_size": 32,
"max_nrof_epochs": 100,
"learning_rate": 0.0005,
"loss_type": "categorical_crossentropy",
"avg_model_num": 10,
"steps_per_epoch": 4.0, # 4.0
"lr_callback_patience": 4,
"lr_callback_cooldown": 1,
"early_stopping_callback_patience": 6,
"get_data": True,
}
def get_args():
return ShareArgs.args
def set_args(args):
ShareArgs.args = args
def set_args_value(key, value):
ShareArgs.args[key] = value
def get_args_value(key, default_value=None):
return ShareArgs.args.get(key, default_value)
def contain_key(key):
return key in ShareArgs.args.keys()
def update(args):
ShareArgs.args.update(args)
pass
class UrBiGetDatasBase():
# 线程锁列表,同保存路径共用锁
lock_dict:Dict[str, threading.Lock] = {}
# 时间列表,用于判断是否超时
time_dict:Dict[str, datetime.datetime] = {}
# 用于记录是否需要更新超时时间
get_data_timeout_dict:Dict[str, bool] = {}
def __init__(
self,
host='10.2.32.22',
port=21051,
database='ur_ai_dw',
auth_mechanism='LDAP',
user='urbi',
password='Ur#730xd',
save_dir=None,
logger:logging.Logger=None,
):
self.save_dir = save_dir
self.logger = logger
self.db_helper = HiveHelper(
host=host,
port=port,
database=database,
auth_mechanism=auth_mechanism,
user=user,
password=password,
logger=logger
)
# 创建子目录
if self.save_dir is not None and not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
self.vars_helper = None
if GlobalShareArgs.get_args_value('debug'):
self.vars_helper = VarsHelper('./hjx/data/vars/UrBiGetDatas')
def close(self):
'''关闭连接'''
self.db_helper.close_conn()
def get_last_time(self, key_name) -> bool:
'''获取是否超时'''
# 转静态路径,确保唯一性
key_name = os.path.abspath(key_name)
if self.vars_helper is not None and self.vars_helper.has_key('UrBiGetDatasBase.time_list'):
UrBiGetDatasBase.time_dict = self.vars_helper.get_value('UrBiGetDatasBase.time_list')
timeout = 12 # 12小时
if GlobalShareArgs.get_args_value('debug'):
timeout = 24 # 24小时
get_data_timeout = False
if key_name not in UrBiGetDatasBase.time_dict.keys() or (datetime.datetime.today() - UrBiGetDatasBase.time_dict[key_name]).total_seconds()>(timeout*60*60):
self.logger.info('超时%d小时,重新查数据:%s', timeout, key_name)
# UrBiGetDatasBase.time_list[key_name] = datetime.datetime.today()
get_data_timeout = True
else:
self.logger.info('未超时%d小时,跳过查数据:%s', timeout, key_name)
# if self.vars_helper is not None :
# self.vars_helper.set_value('UrBiGetDatasBase.time_list', UrBiGetDatasBase.time_list)
UrBiGetDatasBase.get_data_timeout_dict[key_name] = get_data_timeout
return get_data_timeout
def save_last_time(self, key_name):
'''更新状态超时'''
# 转静态路径,确保唯一性
key_name = os.path.abspath(key_name)
if UrBiGetDatasBase.get_data_timeout_dict[key_name]:
UrBiGetDatasBase.time_dict[key_name] = datetime.datetime.today()
if self.vars_helper is not None :
UrBiGetDatasBase.time_dict[key_name] = datetime.datetime.today()
self.vars_helper.set_value('UrBiGetDatasBase.time_list', UrBiGetDatasBase.time_dict)
def get_lock(self, key_name) -> threading.Lock:
'''获取锁'''
# 转静态路径,确保唯一性
key_name = os.path.abspath(key_name)
if key_name not in UrBiGetDatasBase.lock_dict.keys():
UrBiGetDatasBase.lock_dict[key_name] = threading.Lock()
return UrBiGetDatasBase.lock_dict[key_name]
def get_data_of_date(
self,
save_dir,
sql,
sort_columns:List[str],
del_index_list=[-1], # 删除最后下标
start_date = datetime.datetime(2017, 1, 1), # 开始时间
offset = relativedelta(months=3), # 时间间隔
date_format_fun = lambda d: '%04d%02d01' % (d.year, d.month), # 查询语句中替代时间参数的格式化
filename_format_fun = lambda d: '%04d%02d.csv' % (d.year, d.month), # 查询语句中替代时间参数的格式化
stop_date = '20700101', # 超过时间则停止
data_format_fun = None, # 格式化数据
):
'''分时间增量读取数据'''
# 创建文件夹
if not os.path.exists(save_dir):
os.makedirs(save_dir)
else:
#删除最后一个文件
file_list = os.listdir(save_dir)
if len(file_list)>0:
file_list.sort()
for del_index in del_index_list:
os.remove(os.path.join(save_dir,file_list[del_index]))
print('删除最后一个文件:', file_list[del_index])
select_index = -1
# start_date = datetime.datetime(2017, 1, 1)
while True:
end_date = start_date + offset
start_date_str = date_format_fun(start_date)
end_date_str = date_format_fun(end_date)
self.logger.info('date: %s-%s', start_date_str, end_date_str)
file_path = os.path.join(save_dir, filename_format_fun(start_date))
# self.logger.info('file_path: %s', file_path)
if not os.path.exists(file_path):
data:pd.DataFrame = self.db_helper.get_data(sql % (start_date_str, end_date_str))
if data is None:
break
self.logger.info('data: %d', len(data))
# self.logger.info('data: %d', data.columns)
if len(data)>0:
select_index+=1
if data_format_fun is not None:
data = data_format_fun(data)
# 排序
data = data.sort_values(sort_columns)
data.to_csv(file_path)
elif select_index!=-1:
break
elif stop_date < start_date_str:
raise Exception("读取数据异常,时间超出最大值!")
start_date = end_date
pass
class UrBiGetDatas(UrBiGetDatasBase):
def __init__(
self,
host='10.2.32.22',
port=21051,
database='ur_ai_dw',
auth_mechanism='LDAP',
user='urbi',
password='Ur#730xd',
save_dir='./hjx/data/ur_bi_dw_data',
logger:logging.Logger=None
):
self.save_dir = save_dir
self.logger = logger
super().__init__(
host=host,
port=port,
database=database,
auth_mechanism=auth_mechanism,
user=user,
password=password,
save_dir=save_dir,
logger=logger
)
def get_dim_date(self):
'''日期数据'''
file_path = os.path.join(self.save_dir,'ur_bi_dw.dim_date.csv')
now_lock = self.get_lock(file_path)
now_lock.acquire() # 加锁
try:
# 设置超时4小时才重新查数据
if not self.get_last_time(file_path):
return
sql = 'SELECT * FROM ur_bi_dw.dim_date'
data:pd.DataFrame = self.db_helper.get_data(sql)
columns = list(data.columns)
columns = {c:'dim_date.'+c for c in columns}
data = data.rename(columns=columns)
data = data.sort_values(['dim_date.date_key'])
data.to_csv(file_path)
# 更新超时时间
self.save_last_time(file_path)
except Exception as ex:
self.logger.exception(ex)
raise ex # 往外抛出异常
finally:
now_lock.release() # 释放锁
def get_dim_shop(self):
'''店铺数据'''
file_path = os.path.join(self.save_dir,'ur_bi_dw.dim_shop.csv')
now_lock = self.get_lock(file_path)
now_lock.acquire() # 加锁
try:
# 设置超时4小时才重新查数据
if not self.get_last_time(file_path):
return
sql = 'SELECT * FROM ur_bi_dw.dim_shop'
data:pd.DataFrame = self.db_helper.get_data(sql)
columns = list(data.columns)
columns = {c:'dim_shop.'+c for c in columns}
data = data.rename(columns=columns)
data = data.sort_values(['dim_shop.shop_no'])
data.to_csv(file_path)
# 更新超时时间
self.save_last_time(file_path)
except Exception as ex:
self.logger.exception(ex)
raise ex # 往外抛出异常
finally:
now_lock.release() # 释放锁
def get_dim_vip(self):
'''会员数据'''
sub_dir = os.path.join(self.save_dir,'vip_no')
now_lock = self.get_lock(sub_dir)
now_lock.acquire() # 加锁
try:
# 设置超时4小时才重新查数据
if not self.get_last_time(sub_dir):
return
sql = '''SELECT dv.*, dd.date_key, dd.date_name2
FROM ur_bi_dw.dim_vip as dv
INNER JOIN ur_bi_dw.dim_date as dd
ON dv.card_create_date=dd.date_name2
where dd.date_key >= %s
and dd.date_key < %s'''
# data:pd.DataFrame = self.db_helper.get_data(sql)
sort_columns = ['dv.vip_no']
# TODO:
self.get_data_of_date(
save_dir=sub_dir,
sql=sql,
sort_columns=sort_columns,
start_date=datetime.datetime(2017, 1, 1), # 开始时间
offset=relativedelta(years=1)
)
# 更新超时时间
self.save_last_time(sub_dir)
except Exception as ex:
self.logger.exception(ex)
raise ex # 往外抛出异常
finally:
now_lock.release() # 释放锁
def get_weather(self):
'''天气数据'''
sub_dir = os.path.join(self.save_dir,'weather')
now_lock = self.get_lock(sub_dir)
now_lock.acquire() # 加锁
try:
# 设置超时4小时才重新查数据
if not self.get_last_time(sub_dir):
return
sql = """
select weather.* from ur_bi_ods.ods_base_weather_data_1200 as weather
where weather.date_key>=%s and weather.date_key<%s
"""
sort_columns = ['weather.date_key','weather.areaid']
def data_format_fun(data):
columns = list(data.columns)
columns = {c:'weather.'+c for c in columns}
data = data.rename(columns=columns)
return data
self.get_data_of_date(
save_dir=sub_dir,
sql=sql,
sort_columns=sort_columns,
del_index_list=[-2, -1], # 删除最后下标
data_format_fun=data_format_fun,
)
&nbs
版权声明:除特别声明外,本站所有文章皆是本站原创,转载请以超链接形式注明出处!