# NLP Tutorials

# encoding: utf-8
# Input: a set of resumes that has already been parsed into JSON; the pdfString
# module can be used to produce it.
# Assumptions: the parsing module reliably yields the candidate's name, the
# target position and the resume body.
import ast
import json
import logging
import os
import pickle
import re
import sys
from collections import Counter

import jieba
import numpy  as np
import pandas as pd
import requests
import torch
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

# self written
import infoextract
import pdfString
import Try02
from data import Reference

# This file expects JSON in the form [{},{},...,{}]
# Set up paths
FILEPATH = "C:\\Alan .AIA\\Python\\CV_Auto\\data\\Result.csv"
CSV_PATH = "C:\\Alan .AIA\\Python\\CV_Auto\\data"
INFOLIST = ["name", "infotext", "jobs"]
STOPWORDSITE = "https://raw.githubusercontent.com/goto456/stopwords/master/cn_stopwords.txt"# 机器学习实体
tfidf   = TfidfTransformer()
counter = CountVectorizer(analyzer = 'word')# 输入简历集
def inputSource (sourcePath):filename = sourcePath.split("\\")[-1]if   (".json" in filename):targetDS  = pd.read_json(sourcePath, encoding = "utf-8")targetDS.to_csv(CSV_PATH + filename.split(".")[0] + ".csv", encoding = "utf_8_sig")elif (".csv" in filename):targetDS = pd.read_csv(sourcePath, encoding = "utf-8")return  targetDSclass preExtractor (object):# 初始化预解析器def __init__ (self, sourceText, filename):self.fullText = sourceTextself.file_dir = filename# Extract Informationansinfo = infoextract.Extractor(file_dir = self.file_dir, file_text = self.fullText, switch = 1).search()self.info = { "name":         ansinfo["user_name"], "infotext":     self.textCut (content = self.fullText),"jobs":         ansinfo["jobs"]}def textCut (self, content):src = self.textwasher(text = content, quit_universal = True), seg = list(jieba.cut(str(src[0]).strip()))# 去除停用词seg = function.removeStopword(seg)# 去除纯数字seg = list(filter(lambda x: not str(x).isdigit(), seg))return seg# 中文简历文本清洗和去除停用词def textwasher (self, text, quit_universal):# 去除分行 去除关于友邦保险所额外添加的信息项 quit_universal == Trueif (quit_universal):textL = text.split('\n')count = 0while (count < 7):textL.remove(textL[0])count += 1text0 = " ".join(textL[:-5])# 文本清洗# import spacypattern1 = '[’!"#$%&\'()*+,-./:::;<=>?@[\\]^_`{|}~]+'pattern2 = '\\s+'pattern3 = r'[\n|\u3000|\s*$]'pattern4 = re.compile(u'[^\s1234567890::' + '\u4e00-\u9fa5' + 'a-zA-Z]+')text1 = re.sub(pattern1 + pattern2, '', text0)text2 = re.sub(pattern3, '', text1)text3 = re.sub(pattern4, '', text2)return text3class function ():# 对数据集进行解析def extractDS (targetDS):for key in INFOLIST:targetDS[key] = targetDS["Text"].apply(lambda x: "")for index, row in targetDS.iterrows():ansinfo = preExtractor(sourceText = row["Text"], filename = row["File_name"]).infofor key in ansinfo:targetDS.loc[index, key] = str(ansinfo[key])# 从网站上导入停用词def getSiteStopword ():if not os.path.exists('data/stopWord.json'):stopWord = requests.get(STOPWORDSITE)with open("data/stopWord.json", "wb") as 
f:f.write(stopWord.content)with open("data/stopWord.json", "r") as f:stopWord.STOPLIST += f.read().split("\n")# 去除停用词def removeStopword (wordList):filteredWords = [word for word in wordList if word not in Reference.STOPWORDLIST]return filteredWords# 职业分类def classifyJobs (position):result = "others"pattern1 = re.compile(u'[^\s1234567890::' + 'a-zA-Z]+')position1 = re.sub(pattern1, "", position)if len(position1) > 2:result = position1else:result = positionjobsDict = Reference.JOBS_TYPE_DICTfor key in jobsDict:if key in result:result = jobsDict[key]breakif isinstance(result, str) == True:# return 0return 4 # 现在让不知所云者当BAreturn resultclass   textVary (object):def tf_idf_regression (trainL, testL, y_trainL, y_testL):# Setup tfidf modelinfo_train1 = [' '.join(i) for i in trainL]info_test1  = [' '.join(i) for i in testL]tfidf_train = tfidf.fit_transform(counter.fit_transform(info_train1))tfidf_test  = tfidf.fit_transform(counter.transform(info_test1))print(tfidf_train.shape, tfidf_test.shape)# Train tfidf modelparam_grid = {'C': [0.01, 0.1, 1.0, 2.0, 10, 100], 'penalty' : ['l2']# 'penalty' : ['l1', 'l2']}clf = LogisticRegression()grid_search = GridSearchCV (estimator = clf,param_grid = param_grid,scoring = 'accuracy',cv = 5,n_jobs = -1)grid_search.fit (tfidf_train, y_trainL)print(grid_search.best_params_)print(grid_search.best_score_)lr_best = LogisticRegression(penalty='l2',C=2)lr_best.fit(tfidf_train, y_trainL)tf_idf_y_pred = lr_best.predict(tfidf_test)# print(tf_idf_y_pred)print('TF-IDF LR test accuracy %s' % metrics.accuracy_score(y_testL, tf_idf_y_pred))print('TF-IDF LR test F1_score %s' % metrics.f1_score(y_testL, tf_idf_y_pred, average="macro"))return lr_best'''def word2vec_regression (trainL, testL, y_trainL, y_testL):model = KeyedVectors.load_word2vec_format('data/sgns.zhihu.word')model['']vocabulary = model.vocabvec_lem = model[''].shape[0]grid_search = GridSearchCV( estimator = clf,param_grid = param)'''def bert_regression (trainL, testL, y_trainL, y_testL):# 
Set-up basic Informationgpu = 0use_cuda = gpu >= 0 and torch.cuda.is_available()print(use_cuda)if use_cuda:torch.cuda.set_device(gpu)device = torch.device("cuda", gpu)else:device = torch.device("cpu")logging.info("Use cuda: %s, gpu id: %d.", use_cuda, gpu)bert_model_dir = 'bert-mini'tokenizer = BertTokenizer.from_pretrained(bert_model_dir)Bertmodel = BertModel.from_pretrained(bert_model_dir)word = ['今天我是一个大笨蛋']input_id = tokenizer(word, padding = True, truncation = True, max_length = 0, return_tensors = 'pt')result = Bertmodel(input_id['input_ids'])print(result)vec_len = len(result[0][0][1])print(vec_len)def train_Model (model):# Step 01################################################################################# 输入信息data = inputSource (sourcePath = FILEPATH)function.extractDS (targetDS = data)ProcessData = data[['name', 'infotext', 'jobs']]# ProcessData['Type'] = ProcessData['jobs'].apply(lambda x: 0)count = 0rowSize = len(ProcessData)ProcessData.insert(loc = len(ProcessData.columns), column = 'Type', value = [0 for i in range(rowSize)])print(ProcessData['infotext'])while (count < rowSize):ProcessData.loc[count, 'Type'] = function.classifyJobs(ProcessData.loc[count, 'jobs'])ProcessData.at[count, 'infotext'] = eval(ProcessData.loc[count, 'infotext'])count += 1# 下面这个是正确的 但是上面的赋值会受到排序不一的干扰 看有没有办法解决# ProcessData['infotext'] = ProcessData['infotext'].apply(lambda x: eval(x))# print(ProcessData)# Sorted by job types# SortedProcessData = ProcessData.sort_values('Type')'''print(SortedProcessData)for index, row in data.iterrows():print(SortedProcessData['jobs'][index] + "  " + str(ProcessData['Type'][index]))AdminData = ProcessData[ProcessData.Type == 3]print(AdminData['infotext'])'''# Step 02################################################################################# 这样我们应该就可以实现分类计算词频了 开始训练 划分训练集和测试集 这些参数可以进行调节X_Set = ProcessData['infotext'] # X infoY_Set = ProcessData['Type']     # Y typetest_ratio = 0.2x_train, x_test, y_train, y_test = train_test_split (X_Set, 
Y_Set, test_size = test_ratio, random_state = 0)'''print("See Results\n")print(x_train.head(), y_train.head())'''if (model == "tfidf"):fn = textVary.tf_idf_regression (trainL = x_train, testL = x_test, y_trainL = y_train, y_testL = y_test)f  = open('models/tfidf_model1.pkl', 'wb')pickle.dump(fn, f)f.close()if (model == "bert"):fn = textVary.bert_regression (trainL = x_train, testL = x_test, y_trainL = y_train, y_testL = y_test)'''f  = open('models/bert_model1.pkl', 'wb')pickle.dump(fn, f)f.close()''''''fn = textVary.tf_idf_regression (trainL = x_train, testL = x_test, y_trainL = y_train, y_testL = y_test)return fn'''def recommend_Resume (targetPDF):targetFile = pdfString.Transformer(file_dir = targetPDF, quitD = 1).infotargetInfo = preExtractor(sourceText = targetFile["Text"], filename = targetPDF).infotargetText = eval(str(targetInfo["infotext"]))# 训练模型train_Model(model = "tfidf")# train_Model(model = "bert")# 调用模型f  = open('models/tfidf_model1.pkl', 'rb')fn = pickle.load(f)f.close()try1 = [' '.join(targetText)]# print(try1)# print("True: " + str(ProcessData.loc['Type']) + "\n" + ProcessData.loc[num, 'jobs'])# print("True: " + function.classifyJobs(targetInfo['jobs']) + "\n" + targetInfo['jobs'])tfidf_try1 = tfidf.fit_transform(counter.transform(try1))try1_pred  = fn.predict(tfidf_try1)print("PREDICT: " + str(try1_pred) + ": " + Reference.JOB_RECOMMENDATION[try1_pred[0]])################################################################################# tf_idf 向量化# Main Function
if __name__ == "__main__":
    # Smoke-test the trained classifier on a single resume, then show the
    # fields the extractor finds in the same file.
    targetPDF = "Kenny.pdf"
    recommend_Resume (targetPDF = targetPDF)
    ansinfo = Try02.Extractor(file_dir = targetPDF).search()
    Try02.Generator(sourceInfo = ansinfo).display()
# coding:utf-8
# 目前还缺乏研究 如果有多个专业应该怎么处理
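# Compare the results of several approaches.
# Split the text into sections first and parse afterwards to improve accuracy and efficiency.
import os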
import re
from   xml.dom.minidom import parse
import csv
import jieba
import pdfplumber as pb
import sys
import datetime
import pyDataverse as pd
import json
import sys
# import provinces################################################################################################
# PowerBi dataverse
BASE_URL  = "https://globaldisco.crm5.dynamics.com/api/discovery/v2.0/Instances"
API_TOKEN = "https://org61624faf.api.crm5.dynamics.com/api/data/v9.2"# 输入: 简历文件所在文件夹 用于遍历所有简历
FolderPath = r"C:\Alan .AIA\Python\CV_Automation\ResumeRespo"
PdfResumePath = []              # PDF resume files that match the criteria
DocxResumePath = []             # 符合要求的 docx 简历文件的列表
filename = []                   # 存储简历文件名的目录
ids = []                        # 依照顺序标记简历 id
data = []                       # 最终输出数据结构################################################################################################
# 参考集 字典# 个人筛选的 200 个常用姓氏 生成字典
Surname_List = ['赵','钱','孙','李','周','吴','郑','王','冯','陈','褚','卫','蒋','沈','韩','杨','朱','秦','尤','许','何','吕','施','张','孔','曹','严','金','魏','陶','姜','戚','谢','邹','苏','潘','葛','奚','范','彭','郎','鲁','韦','昌','马','苗','方','俞','任','袁','柳','酆','鲍','史','唐','费','廉','岑','薛','雷','贺','倪','汤','滕','殷','罗','毕','郝','邬','安','常','乐','于','时','傅','皮','齐','康','余','卜','顾','孟','平','黄','穆','萧','尹','姚','邵','汪','祁','毛','狄','米','贝','明','臧','成','戴','宋','茅','庞','熊','纪','舒','屈','项','祝','董','梁','杜','阮','蓝','闵','席','季','麻','贾','路','娄','危','江','童','颜','郭','梅','盛','林','徐','邱','骆','高','夏','蔡','田','樊','胡','凌','霍','虞','万','柯','管','卢','莫','房','丁','宣','邓','郁','单','杭','洪','包','诸','石','崔','吉','钮','龚','程','嵇','邢','裴','陆','翁','芮','靳','松','井','段','富','焦','巴','谷','车','全','郗','班','秋','仲','伊','宁','仇','栾','甘','祖','武','符','刘','景','詹','龙','叶','幸','韶','黎','溥','庄','白']
# Lookup from surname to its position in Surname_List: {'赵':0,'钱':1,'孙':2,'李':3, ...}
Surname_Dict = dict(zip(Surname_List, range(len(Surname_List))))
# 专业
Major_List = ['软件工程','计算机软件','计算机硬件','互联网','通信','电信','网络资源','计算机科学与技术']# 技能
Skillset_List = ['Java', 'C', 'WEB', 'SQL', 'EJB', 'Cpp', 'C#', 'dotnet', 'RPA', 'Python', 'HTML', 'Html', 'CSS', 'JavaScript', 'R', '外语', 'Office', '项目']# 地点
Location_List = ['成都', '广州']# 来源
Vendor_List = ['猎聘', '智联', '前程', '领英', '51']################################################################################################
# 子函数
################################################################################################
# 抽取器 抽取单个文件的信息
class Extractor (object):
    """Extract candidate fields from one resume file.

    For ``.pdf`` files the text (``fullText``) and the positioned words
    (``fullWord``) are read with pdfplumber; any other extension leaves both
    empty. ``search()`` is the entry point and returns a dict of fields.
    """

    # Token that may be (part of) an e-mail address.
    _EMAIL_TOKEN = r"[a-zA-Z0-9_\-.@]+"

    def __init__ (self, file_dir):
        """Read the resume at ``file_dir`` (only ``.pdf`` is parsed)."""
        self.fullWord = []
        self.fullText = ""
        self.file_dir = file_dir
        if os.path.splitext(self.file_dir)[1] == ".pdf":
            # The context manager closes the PDF even if a page fails to parse.
            with pb.open(self.file_dir) as pdf:
                for page in pdf.pages:
                    self.fullWord += page.extract_words()
                    # NOTE(review): pages are concatenated without a
                    # separator, as before - confirm a newline is not wanted.
                    self.fullText += page.extract_text() or ""

    def _word_text (self, word):
        """Return the text of one ``fullWord`` entry.

        PDF words are dicts with a ``"text"`` key; for other sources the
        entry is used as-is.
        """
        if os.path.splitext(self.file_dir)[1] == ".pdf":
            return word["text"]
        return word

    @staticmethod
    def __readUntil (text, length):
        """Placeholder: read a paragraph until a short line (not implemented)."""
        return ""

    # Required fields: name, position, major, phone, attachment, vendor, gender

    # 01 Name
    def __search_Name (self):
        """Find the candidate name.

        Looks for a 2-3 character Chinese run starting with a known surname
        in the file name, then for a ``姓 名`` field, otherwise for the first
        surname-led word in the text. Returns ``""`` when nothing is found.
        """
        base_name = re.split(r"[\\/]", self.file_dir)[-1]
        for candidate in re.findall(r"[\u4e00-\u9fa5]{2,3}", base_name):
            if candidate[0] in Surname_List:
                return candidate

        # Built once; joining without "|" keeps a literal "|" out of the
        # character class (it used to match as if it were a surname).
        surname_led = re.compile("[" + "".join(Surname_List) + "]" + r"[\u4e00-\u9fa5]{1,3}")
        for line in self.fullText.split("\n"):
            if re.search(r"姓[ ]+名", line):
                # Capture only the value so characters such as 名 inside the
                # name are not stripped together with the label.
                labelled = re.search(r"姓[ ]+名[ :\\n]+([\u4e00-\u9fa5]{2,4})", line)
                if labelled:
                    name = labelled.group(1)
                    return name if name[0] in Surname_List else ""
                continue
            found = surname_led.findall(line)
            if found:
                return found[0]
        return ""

    # 02 Jobs
    def __search_Jobs (self):
        """Return the position applied for, or ``""``."""
        titles = ["期望职位", "应聘职位", "期望从事职位"]
        for line in self.fullText.split("\n"):
            for title in titles:
                found = re.search(title + r"[::\s]*([a-z|A-Z|0-9|\u4e00-\u9fa5]{2,14})", line)
                if found:
                    return found.group(1)
        return ""

    # 03 Major
    def __search_Major (self):
        """Return the major (or industry), or ``""``.

        A line containing an entry of ``Major_List`` wins immediately;
        otherwise the first ``专 业`` / ``行 业`` field value is used, and as a
        last resort the first word ending in 专业 / 系 / 技术.
        """
        field_values = []
        for line in self.fullText.split("\n"):
            # "专 业" is the 51job layout, "行 业" the Liepin one.
            for label in (r"专[ ]+业", r"行[ ]+业"):
                found = re.search(label + r"[::\s]*([\u4e00-\u9fa5]{2,10})", line)
                if found:
                    field_values.append(found.group(1))
            for premajor in Major_List:
                if premajor in line:
                    return premajor
        if field_values:
            return field_values[0]

        # Fall back to the individual words of the document.
        for word in self.fullWord:
            text = self._word_text(word)
            # ``"专业" or "系" ... in text`` was always true; test each keyword.
            # Known false positive: 系 also matches e.g. 扫码联系.
            if any(key in text for key in ("专业", "系", "技术")):
                found = re.findall(r"[\u4e00-\u9fa5]{2,10}?(?:专业|系|技术)", text)
                if found:
                    return found[0]
            elif any(key in text for key in ("Bsc", "Major", "Msc")):
                # ``[a-Z]`` is not a valid range; English majors are still
                # only roughly supported.
                found = re.findall(r"[a-zA-Z]{2,5}?(?:Bsc|Msc|Major)", text)
                if found:
                    return found[0]
        return ""

    # 04 Phone
    def __search_Phone (self):
        """Return the phone number(s) as a string.

        The first 11-15 digit run on a ``电话`` / ``手机`` line is preferred.
        Without one, all distinct 11-15 digit runs of the document are
        returned comma-separated in order of appearance.
        """
        for line in self.fullText.split("\n"):
            if re.search(r"电\s*话", line) or re.search(r"手\s*机", line):
                cleaned = re.sub(r"[()()::+\-]", "", line)
                numbers = re.findall(r"\d{11,15}", cleaned)
                if numbers:
                    return numbers[0]
        text = re.sub(r"[()()+\-]", "", self.fullText)
        # dict.fromkeys de-duplicates while keeping a stable order; a set made
        # the output order vary between runs.
        return ",".join(dict.fromkeys(re.findall(r"\d{11,15}", text)))

    # 06 Vendor
    def __search_Vendor (self):
        """Return the resume source found in the path or the first 21 lines."""
        for vendor in Vendor_List:
            if vendor in self.file_dir:
                return vendor
        for line in self.fullText.split("\n")[:21]:
            for vendor in Vendor_List:
                if vendor in line:
                    return vendor
        return ""

    # 07 Gender
    def __search_Gender (self):
        """Return ``"男"`` / ``"女"`` from the first 16 lines.

        If neither is found, the value of a ``性 别`` field is returned when
        present, else ``""``.
        """
        gender = ""
        for line in self.fullText.split("\n")[:16]:
            if re.search(r"男", line) or re.search(r"Male", line):
                return "男"
            if re.search(r"女", line) or re.search(r"Female", line):
                return "女"
            # {1,10}: the former {2,10} could not match a one-character value.
            labelled = re.search(r"性[ ]+别[::\s]*([\u4e00-\u9fa5]{1,10})", line)
            if labelled:
                gender = labelled.group(1)
        return gender

    # Optional fields
    # 08 Age
    def __search_Age (self):
        """Return the age as a string, or ``""``.

        Uses the first four-digit year on a ``出生年月`` line (current year
        minus birth year) or the number directly in front of ``岁``.
        """
        current_year = datetime.datetime.now().year
        for line in self.fullText.split("\n"):
            if re.search(r"出生年月", line):
                years = re.findall(r"\d{4}", line)
                if years:
                    return str(current_year - int(years[0]))
            # Anchor on 岁 so unrelated digits earlier in the line are ignored.
            stated = re.search(r"(\d{1,3})\s*周?岁", line)
            if stated:
                return stated.group(1)
        return ""

    # 09 Condition
    def __search_Condition (self):
        """Return the employment status found in the first 21 lines."""
        for line in self.fullText.split("\n")[:21]:
            for status in ("离职", "正在找工作", "在职"):
                if re.search(status, line):
                    return status
        return ""

    # 10 City
    def __search_City (self):
        """Return the current location with whitespace removed, or ``""``."""
        patterns = (
            r"地点[::\s]*([\u4e00-\u9fa5]{2,8})",
            r"(?:所在地|现居地)[::\s]*([\u4e00-\u9fa5\s]{2,8})",
            r"住\s*址[::\s]*([\u4e00-\u9fa5\s]{2,8})",
            r"现居住[::\s]*([\u4e00-\u9fa5\s]{2,8})",
            r"Location[::\s]*([\u4e00-\u9fa5\s]{2,8})",
        )
        for line in self.fullText.split("\n"):
            for pattern in patterns:
                found = re.search(pattern, line)
                if found:
                    location = re.sub(r"\s", "", found.group(1))
                    if location:
                        return location
        return ""

    # 11 Stage
    def __search_Stage (self):
        """Return the education level from the first line that mentions one.

        The highest degree on that line wins, so ``北京大学 硕士`` yields 硕士
        rather than 本科. A line naming only 大专 / 专科 / 学院 yields 专科.
        """
        set_phd = ["博士"]
        set_msc = ["硕士", "研究生"]
        set_bsc = ["大学", "本科"]
        set_other = ["大专", "专科", "学院"]
        for line in self.fullText.split("\n"):
            if any(key in line for key in set_phd):
                return "博士"
            if any(key in line for key in set_msc):
                return "硕士"
            if any(key in line for key in set_bsc):
                return "本科"
            if any(key in line for key in set_other):
                return "专科"
        return ""

    # 12 Hometown
    def __search_Hometown (self):
        """Return the value of the first ``籍 贯`` field, or ``""``."""
        # All lines are scanned: the original line counter was never
        # advanced, so its 15-line limit never took effect.
        for line in self.fullText.split("\n"):
            found = re.search(r"籍[ ]+贯[::\s]*([\u4e00-\u9fa5]{2,10})", line)
            if found:
                return found.group(1)
        return ""

    # 13 Self-Comment
    def __search_SelfComment (self):
        """Placeholder: self-evaluation extraction is not implemented."""
        return ""

    # 14 Working Experience
    def __search_WorkExperience (self):
        """Placeholder: not implemented."""
        return ""

    # 15 Education Experience
    def __search_EducationExperience (self):
        """Placeholder: not implemented."""
        return ""

    # 16 School
    def __search_School (self):
        """Return the school name with whitespace removed, or ``""``.

        Uses the first line with a ``毕业院校`` field or a 大学 / 校区 / 学院
        keyword. When that line yields several candidates the last one is
        returned (a college usually follows its university).
        """
        # The order matters: university, campus, college.
        college_signs = ["大学", "校区", "学院"]
        candidates = []
        for line in self.fullText.split("\n"):
            labelled = re.search(r"毕业院校[::\s]*([\u4e00-\u9fa5]{2,10})", line)
            if labelled:
                candidates.append(labelled.group(1))
            for sign in college_signs:
                if sign in line:
                    # Turn punctuation into spaces so \S stops at it.
                    line = re.sub(r"[::|-]", " ", line)
                    candidates += re.findall(r"\S{2,10}" + sign, line)
            if candidates:
                return re.sub(r"\s", "", candidates[-1])
        return ""

    # 17 Certificate
    def __search_Certificate (self):
        """Placeholder: not implemented."""
        return ""

    # 18 Skill-Set
    def __search_ProfessionalSkills (self):
        """Placeholder: not implemented."""
        return ""

    # 19 Expected Salary
    def __search_Salary (self):
        """Return the expected salary range (``low-high``), or ``""``.

        The range is read from the first ``期望薪资`` or ``/月`` line that
        contains one. Every ``<n>万`` amount is expanded to a plain number
        (``1.5万`` -> ``15000``).
        """
        range_pattern = r"[0-9\.\s 万]{1,10}-[0-9\.\s 万]{1,10}"

        def expand_wan (match):
            return str(round(float(match.group(1)) * 10000))

        for line in self.fullText.split("\n"):
            if "期望薪资" in line or "/月" in line:
                ranges = re.findall(range_pattern, line)
                if ranges:
                    salary = re.sub(r"[\s ]", "", ranges[-1])
                    # NOTE(review): "1-1.5万" keeps its left side unscaled.
                    return re.sub(r"(\d+(?:\.\d+)?)万", expand_wan, salary)
        return ""

    # 20 Working Years
    def __search_WorkYears (self):
        """Placeholder: not implemented."""
        return ""

    # 22 Email
    def __search_Email (self):
        """Return the e-mail address, or ``""``.

        The first ``@`` token on an ``邮 箱`` line is preferred, then the
        first ``@`` token among the document words.
        """
        for line in self.fullText.split("\n"):
            if re.search(r"邮[ ]+箱", line):
                for token in re.findall(self._EMAIL_TOKEN, line):
                    if "@" in token:
                        return token
        for word in self.fullWord:
            text = self._word_text(word)
            if "@" in text and "." in text:
                for token in re.findall(self._EMAIL_TOKEN, text):
                    if "@" in token:
                        return token
        return ""

    # Skills
    def __search_Skill (self):
        """Return every line that mentions an entry of ``Skillset_List``."""
        return [
            line for line in self.fullText.split("\n")
            if any(re.search(keyword, line) for keyword in Skillset_List)
        ]

    def search (self):
        """Run every field search and return the results.

        Returns:
            A dict with ``Directory``, ``file_name`` and one entry per field.
            A search that raises is reported with ``print`` and leaves its
            field at ``""``, so one bad field does not lose the whole resume.
        """
        # Split on "/" or "\": the file name is the last part, the directory
        # the one before it (or the file name itself for a bare name).
        sep_dir = re.split(r"/+|\\+", self.file_dir)
        file_name = sep_dir[-1]
        directory = sep_dir[-2] if len(sep_dir) > 1 else sep_dir[-1]
        info = {
            "Directory": directory, "file_name": file_name,
            "user_name": "", "email": "", "phone": "", "gender": "",
            "stage": "", "major": "", "age": "", "city": "", "skill": "",
            "jobs": "", "vendor": "", "condition": "", "hometown": "",
            "school": "", "salary": "", "selfComment": "",
        }
        searchers = (
            ("user_name",   self.__search_Name),         # name
            ("jobs",        self.__search_Jobs),         # position applied for
            ("major",       self.__search_Major),        # major
            ("phone",       self.__search_Phone),        # phone
            ("vendor",      self.__search_Vendor),       # source
            ("gender",      self.__search_Gender),       # gender
            ("age",         self.__search_Age),          # age
            ("condition",   self.__search_Condition),    # employment status
            ("city",        self.__search_City),         # current location
            ("stage",       self.__search_Stage),        # education level
            ("hometown",    self.__search_Hometown),     # hometown
            ("selfComment", self.__search_SelfComment),  # self evaluation
            ("school",      self.__search_School),       # school
            ("salary",      self.__search_Salary),       # expected salary
            ("email",       self.__search_Email),        # e-mail
            ("skill",       self.__search_Skill),        # skills
        )
        for key, searcher in searchers:
            # Call inside the try: the searchers used to run while the lookup
            # table was being built, outside any error handling.
            try:
                info[key] = searcher()
            except Exception as e:
                print(e)
        return info
################################################################################################
# 猎聘
# class Lie-Pin (object):################################################################################################
# 智联
# class Zhi-Lian (object):################################################################################################
# 前程无忧
# class Qian-Cheng (object):################################################################################################
# 51jobs
# class Jobs (object):################################################################################################
# 遍历并读取函数
class Reader (object):
    """List the resume files of one folder."""

    def __init__ (self, folder_Path):
        """Remember the folder to scan."""
        self.path = folder_Path

    def read (self, type):
        """Return the paths of the files in the folder whose name contains ``type``.

        Files with ``$`` in their name are skipped. The name of every
        selected file is also appended to the module-level ``filename`` list.

        Args:
            type: Substring identifying the file type, e.g. ``".pdf"``.

        Returns:
            List of matching file paths inside ``self.path``.
        """
        ResumePath = []
        for file in os.listdir(self.path):
            # Join with the folder that was actually listed; the global
            # FolderPath was used here before, ignoring ``folder_Path``.
            filepath = os.path.join(self.path, file)
            if not os.path.isfile(filepath):
                continue
            # Test the file name, not the whole path, so that a folder name
            # containing the pattern does not select (or exclude) everything.
            if type in file and "$" not in file:
                ResumePath.append(filepath)
                filename.append(file)
        return ResumePath
################################################################################################
# 输出生成函数
class Generator (object):
    """Present one extracted resume as console output or JSON text."""

    # (label, key) pairs in print order; the labels are passed to print() as
    # the first argument exactly as written.
    _DISPLAY_FIELDS = (
        # Necessary info
        ("Name     : ", "user_name"),
        ("Position :", "jobs"),
        ("Major    : ", "major"),
        ("Phone    : ", "phone"),
        ("Gender   : ", "gender"),
        ("Source   : ", "file_name"),
        ("Vendor   : ", "vendor"),
        ("Condition: ", "condition"),
        # Optional info
        ("Email    : ", "email"),
        ("City     : ", "city"),
        ("Age      : ", "age"),
        ("Stage    : ", "stage"),
        ("Hometown : ", "hometown"),
        ("School   : ", "school"),
        ("Salary   : ", "salary"),
    )

    def __init__ (self, sourceInfo):
        """Store the dict of extracted fields."""
        self.info = sourceInfo

    def display (self):
        """Print the candidate's fields, one per line.

        Raises:
            KeyError: If ``self.info`` lacks one of the displayed keys.
        """
        result = self.info
        print("################### Candidate ###################")
        for label, key in self._DISPLAY_FIELDS:
            print(label, result[key])
        print("\n\n\n")

    def generate_Json (self):
        """Return ``self.info`` as an indented JSON string with sorted keys.

        Raises:
            TypeError: If a value is not JSON-serialisable. The error used to
                be printed and then followed by an UnboundLocalError because
                no result existed to return.
        """
        return json.dumps (self.info, sort_keys = True, indent = 4, separators=(',',':'), ensure_ascii = False)
################################################################################################
# Json形式下的简历信息发布至 dataverse (Power BI)
class dataverse_Publish (object):
    """Publish resume information in JSON form to a Dataverse (Power BI)."""

    def __init__ (self, sourceJson):
        """Store the JSON source handed in by the caller."""
        self.source = sourceJson

    def process (self):
        """Create, publish and fetch the ``Dataverse_Resumes`` collection.

        Returns:
            The response of the final ``get_dataverse`` call.
        """
        # NOTE(review): the file name is hard-coded and ``self.source`` is not
        # used - confirm which of the two should be read.
        sourceFile = "TestJson.json"
        # Connect to the API. Imported here so pyDataverse is only required
        # when publishing.
        from pyDataverse.api import NativeApi
        from pyDataverse.models import Dataverse
        from pyDataverse.utils import read_file
        # NOTE(review): API_TOKEN holds a URL rather than a token - verify.
        api = NativeApi(BASE_URL, API_TOKEN)
        # Create the collection of data
        dv = Dataverse()
        dv.from_json(read_file(sourceFile))
        resp = api.create_dataverse (":root", dv.json())
        resp = api.publish_dataverse ("Dataverse_Resumes")
        resp = api.get_dataverse ("Dataverse_Resumes")
        return resp
################################################################################################
# 杂项函数
class function:
    """Namespace of small helpers; call them as ``function.<name>(...)``."""

    @staticmethod
    def displayPercent (counter, total, turn):
        """Write a progress indicator to standard output.

        Args:
            counter: Number of items processed so far.
            total: Total number of items; ``0`` is reported as 0 percent.
            turn: ``True`` prints the running percentage followed by a
                carriage return; ``False`` prints the final
                ``100%!finish!`` line.

        Returns:
            Always ``""``.

        Raises:
            TypeError: If the argument types are not (int, int, bool).
        """
        if not (isinstance(counter, int) and isinstance(total, int) and isinstance(turn, bool)):
            raise TypeError("displayPercent expects (int, int, bool)")
        if turn:
            # Regular display
            percent = float(counter) * 100 / float(total) if total else 0.0
            sys.stdout.write("%.4f" % percent)
            sys.stdout.write("%\r")
        else:
            # Final display
            sys.stdout.write("100%!finish!\n")
        sys.stdout.flush()
        return ""

    @staticmethod
    def initiateJson (filename):
        """Open ``filename`` for writing as an empty UTF-8 file.

        Args:
            filename: Path of the output file; must contain ``.json``.

        Returns:
            The open file object; the caller is responsible for closing it.

        Raises:
            TypeError: If ``filename`` is not a string.
            ValueError: If ``filename`` does not contain ``.json``.
        """
        if not isinstance(filename, str):
            raise TypeError("filename must be a string")
        if ".json" not in filename:
            raise ValueError("filename must name a .json file: " + filename)
        # Mode 'w' already truncates the file and positions at offset 0.
        return open(filename, 'w', encoding = 'utf-8')
################################################################################################
# 主函数
if __name__ == "__main__":
    # Step 1: collect every resume file in the folder.
    PdfResumePath  = Reader(folder_Path = FolderPath).read(".pdf")
    DocxResumePath = Reader(folder_Path = FolderPath).read(".docx")
    ResumeInfoList = []

    # Step 2: extract the information of every PDF resume.
    total   = len(PdfResumePath)
    counter = 0
    for counter, file in enumerate(PdfResumePath, start = 1):
        ResumeInfoList.append (Extractor(file_dir = file).search())
        function.displayPercent (counter, total, True)
    function.displayPercent (counter, total, False)

    # Show every result and write them to the JSON file as one array. The
    # file is opened only now and closed even if an entry cannot be written.
    length = len(ResumeInfoList)
    with function.initiateJson ("resume_Result.json") as Json_file:
        Json_file.write("[\n")
        for counter, info in enumerate(ResumeInfoList, start = 1):
            generator = Generator(sourceInfo = info)
            generator.display()
            Json_file.write(generator.generate_Json())
            if counter != length:
                Json_file.write(",")
            Json_file.write("\n")
        Json_file.write("]")

    # Step 3: export to dataverse
    # dataverse_Publish(sourceJson = Json_filename).process()
    # https://orgd9c1d674.api.crm5.dynamics.com/api/data/v9.2
# https://org61624faf.api.crm5.dynamics.com/api/data/v9.2################################################################################################
# 函数 读取信息
# print (ResumePath[0])
# xingming_node = document_tree.getElementsByTagName("XingMing")[0]
# xingming = xingming_node.childNodes[0].data################################################################################################
# 函数 将一份简历信息写入 Excel 文件
# print (ResumePath)
# print (filename)
# coding: utf-8
# The results are exported to the Result.json file.
import os
from   xml.dom.minidom import parse
import pdfplumber as pb
import sys
import datetime
import pyDataverse as pd
import json
import sys

PdfResumePath = []              # PDF resume files that match the criteria
DocxResumePath = []             # 符合要求的 docx 简历文件的列表
filename = []                   # 存储简历文件名的目录# 转换器
class Transformer (object):
    """Convert one resume file into ``{"File_name": ..., "Text": ...}``."""

    def __init__ (self, file_dir, quitD):
        """Read the text of ``file_dir`` and build ``self.info``.

        Args:
            file_dir: Path of the resume; only ``.pdf`` files are read, any
                other extension yields an empty text.
            quitD: ``1`` removes doubled Chinese characters line by line.
        """
        self.fulltext = ""
        self.fileDir = file_dir
        # Open the PDF file
        if os.path.splitext(self.fileDir)[1] == ".pdf":
            # The context manager closes the PDF even if a page fails.
            with pb.open(self.fileDir) as pdf:
                # NOTE(review): pages are concatenated without a separator,
                # as before - confirm a newline is not wanted.
                self.fulltext = "".join(page.extract_text() or "" for page in pdf.pages)
        # Remove doubled Chinese characters
        if quitD == 1:
            self.fulltext = self.quitDuplicate()
        self.info = {"File_name": self.fileDir, "Text": self.fulltext}

    def quitDuplicate (self):
        """Return the text with ``function.quitDuplicate`` applied to each line.

        Every line, including the last one, is terminated with a newline.
        """
        return "".join(
            function.quitDuplicate(line) + '\n'
            for line in self.fulltext.split("\n")
        )

    def generate_Json (self):
        """Return ``self.info`` as an indented JSON string with sorted keys.

        Raises:
            TypeError: If a value is not JSON-serialisable. The error used to
                be printed and then followed by an UnboundLocalError.
        """
        return json.dumps (self.info, sort_keys = True, indent = 4, separators=(',',':'), ensure_ascii = False)


# Utility functions
class function:
    """Namespace of small helpers; call them as ``function.<name>(...)``."""

    @staticmethod
    def displayPercent (counter, total, turn):
        """Write a progress indicator to standard output.

        Args:
            counter: Number of items processed so far.
            total: Total number of items; ``0`` is reported as 0 percent.
            turn: ``True`` prints the running percentage followed by a
                carriage return; ``False`` prints the final
                ``100%!finish!`` line.

        Returns:
            Always ``""``.

        Raises:
            TypeError: If the argument types are not (int, int, bool).
        """
        if not (isinstance(counter, int) and isinstance(total, int) and isinstance(turn, bool)):
            raise TypeError("displayPercent expects (int, int, bool)")
        if turn:
            # Regular display
            percent = float(counter) * 100 / float(total) if total else 0.0
            sys.stdout.write("%.4f" % percent)
            sys.stdout.write("%\r")
        else:
            # Final display
            sys.stdout.write("100%!finish!\n")
        sys.stdout.flush()
        return ""

    @staticmethod
    def initiateJson (filename):
        """Open ``filename`` for writing as an empty UTF-8 file.

        Args:
            filename: Path of the output file; must contain ``.json``.

        Returns:
            The open file object; the caller is responsible for closing it.

        Raises:
            TypeError: If ``filename`` is not a string.
            ValueError: If ``filename`` does not contain ``.json``.
        """
        if not isinstance(filename, str):
            raise TypeError("filename must be a string")
        if ".json" not in filename:
            raise ValueError("filename must name a .json file: " + filename)
        # Mode 'w' already truncates the file and positions at offset 0.
        return open(filename, 'w', encoding = 'utf-8')

    @staticmethod
    def quitDuplicate (source):
        """Remove the second character of doubled Chinese characters.

        PDF extraction can emit characters twice (``姓姓名名``); for every
        position whose character is a CJK ideograph equal to its predecessor
        the character is dropped. Other characters are left untouched.

        TODO: doubled characters that are part of a real name are removed too.
        """
        counter = 1
        while counter < len(source):
            current = source[counter]
            if current == source[counter - 1] and '\u4e00' <= current <= '\u9fff':
                source = source[:counter] + source[counter + 1:]
            counter = counter + 1
        return source


# Folder traversal and reading
class Reader (object):
    """List the resume files of one folder."""

    def __init__ (self, folder_Path):
        """Remember the folder to scan."""
        self.path = folder_Path

    def read (self, type):
        """Return the paths of the files in the folder whose name contains ``type``.

        Files with ``$`` in their name are skipped. The name of every
        selected file is also appended to the module-level ``filename`` list.

        Args:
            type: Substring identifying the file type, e.g. ``".pdf"``.

        Returns:
            List of matching file paths inside ``self.path``.
        """
        ResumePath = []
        for file in os.listdir(self.path):
            # Join with the folder that was actually listed. ``FolderPath``
            # was used here before, which is not a module-level name in this
            # script and ignored ``folder_Path``.
            filepath = os.path.join(self.path, file)
            if not os.path.isfile(filepath):
                continue
            # Test the file name, not the whole path, so that a folder name
            # containing the pattern does not select (or exclude) everything.
            if type in file and "$" not in file:
                ResumePath.append(filepath)
                filename.append(file)
        return ResumePath


# main function
def operater():
    """Dump the text of every PDF resume in the ``respo`` folder to Result.json.

    Asks on stdin whether doubled characters should be collapsed (answer
    ``1`` for yes).  One JSON object per resume is written, each followed by
    a newline.
    NOTE(review): the objects are not wrapped in a JSON array; confirm that
    downstream readers accept this layout.
    """
    FolderPath = r"respo"
    try:
        quitD = int(input('Quit Duplicate?, if yes input 1\n'))
    except ValueError:
        # Anything that is not a number means "no".
        quitD = 0

    # Step 1: collect the resume files in the folder.
    PdfResumePath = Reader(folder_Path=FolderPath).read(".pdf")
    # Collected for parity with the PDF list; docx files are not exported here.
    DocxResumePath = Reader(folder_Path=FolderPath).read(".docx")

    # Step 2: extract every PDF and append its record to the output file.
    total = len(PdfResumePath)
    Json_file = function.initiateJson("Result.json")
    try:
        for counter, file in enumerate(PdfResumePath, start=1):
            Result_Json = Transformer(file_dir=file, quitD=quitD).generate_Json()
            Json_file.write(Result_Json + "\n")
            function.displayPercent(counter, total, True)
    finally:
        # Close the file even when one resume fails to parse.
        Json_file.close()
    function.displayPercent(total, total, False)
# coding: utf-8
# Results are exported to Result.json
import os
from   xml.dom.minidom import parse
import pdfplumber as pb
import sys
import datetime
import pyDataverse as pd
import json

PdfResumePath = []              # pdf resume files that meet the criteria
DocxResumePath = []             # docx resume files that meet the criteria
filename = []                   # names of the resume files accepted by Reader.read

# Converter
class Transformer(object):
    """Read one resume file and expose its text as a JSON-serialisable record.

    Only ``.pdf`` files are parsed (through pdfplumber, imported as ``pb``);
    for any other extension ``Text`` stays empty.
    """

    def __init__(self, file_dir, quitD):
        """Load the file.

        :param file_dir: path of the resume file.
        :param quitD: pass ``1`` to collapse doubled Chinese characters with
            ``function.quitDuplicate``; any other value keeps the raw text.
        """
        self.fulltext = ""
        self.fileDir = file_dir
        if os.path.splitext(self.fileDir)[1] == ".pdf":
            # The context manager closes the PDF even when a page fails to parse.
            with pb.open(self.fileDir) as pdf:
                # extract_text() may return None; call it once per page only.
                self.fulltext = "".join(
                    page.extract_text() or "" for page in pdf.pages
                )
        if quitD == 1:
            self.fulltext = self.quitDuplicate()
        self.info = {"File_name": self.fileDir, "Text": self.fulltext}

    def quitDuplicate(self):
        """Return the text with every line de-duplicated.

        Each line is passed through ``function.quitDuplicate`` and terminated
        with ``"\\n"``, so the result always ends with a newline.
        """
        return "".join(
            function.quitDuplicate(line) + "\n"
            for line in self.fulltext.split("\n")
        )

    def generate_Json(self):
        """Serialise ``self.info`` to an indented JSON string.

        :raises TypeError: if ``self.info`` holds a non-serialisable value.
            The previous version printed the error and then failed with an
            unrelated ``UnboundLocalError``.
        """
        return json.dumps(
            self.info,
            sort_keys=True,
            indent=4,
            separators=(',', ':'),
            ensure_ascii=False,
        )


# Utility functions
class function:
    """Namespace of stateless helpers; every helper is called on the class."""

    @staticmethod
    def displayPercent(counter, total, turn):
        """Write a progress indicator to stdout.

        :param counter: number of items processed so far.
        :param total: number of items overall.
        :param turn: ``True`` prints the running percentage followed by a
            carriage return; ``False`` prints the final "100%" line.
        :return: always ``""``.
        """
        assert (isinstance(counter, int) and isinstance(total, int) and isinstance(turn, bool))
        if turn:
            # An empty job is reported as complete instead of dividing by zero.
            percent = float(counter) * 100 / float(total) if total else 100.0
            sys.stdout.write("%.4f" % percent)
            sys.stdout.write("%\r")
        else:
            sys.stdout.write("100%!finish!\n")
        sys.stdout.flush()
        return ""

    @staticmethod
    def initiateJson(filename):
        """Open *filename* for writing as an empty UTF-8 file and return it.

        The caller owns the returned file object and must close it.
        """
        assert (isinstance(filename, str) and (".json" in filename))
        # Mode "w" already truncates the file and starts at offset 0.
        return open(filename, 'w', encoding='utf-8')

    @staticmethod
    def quitDuplicate(source):
        """Drop the second character of each doubled CJK pair.

        A character in the range U+4E00..U+9FFF that repeats its predecessor
        is removed, and the character right after a removed one is always
        kept.  So "姓姓名名" becomes "姓名" while a run of four becomes two.
        Characters outside that range are never removed.
        """
        kept = []
        previous = None
        keep_next = True  # the first character has nothing to compare with
        for char in source:
            if not keep_next and char == previous and '\u4e00' <= char <= '\u9fff':
                keep_next = True
            else:
                kept.append(char)
                keep_next = False
            previous = char
        return "".join(kept)


# Directory traversal and reading
class Reader(object):
    """List the resume files of one type inside a folder."""

    def __init__(self, folder_Path):
        """:param folder_Path: folder whose direct children are scanned."""
        self.path = folder_Path

    def read(self, type):
        """Return the paths of the regular files whose name contains *type*.

        Files whose name contains ``"$"`` are skipped.  Every accepted file
        name is also appended to the module-level ``filename`` list.

        :param type: substring identifying the file type, e.g. ``".pdf"``.
        :return: list of paths built from the folder given to ``__init__``.
        """
        ResumePath = []
        for file in os.listdir(self.path):
            # Join with this reader's folder; the old code used an unrelated
            # ``FolderPath`` name that is not defined in every caller.
            filepath = os.path.join(self.path, file)
            if not os.path.isfile(filepath):
                continue
            # Test the file name only, so the folder path cannot influence it.
            if type in file and "$" not in file:
                ResumePath.append(filepath)
                filename.append(file)
        return ResumePath


# main function
def operater():
    """Dump the text of every PDF resume in the ``respo`` folder to Result.json.

    Asks on stdin whether doubled characters should be collapsed (answer
    ``1`` for yes).  One JSON object per resume is written, each followed by
    a newline.
    NOTE(review): the objects are not wrapped in a JSON array; confirm that
    downstream readers accept this layout.
    """
    FolderPath = r"respo"
    try:
        quitD = int(input('Quit Duplicate?, if yes input 1\n'))
    except ValueError:
        # Anything that is not a number means "no".
        quitD = 0

    # Step 1: collect the resume files in the folder.
    PdfResumePath = Reader(folder_Path=FolderPath).read(".pdf")
    # Collected for parity with the PDF list; docx files are not exported here.
    DocxResumePath = Reader(folder_Path=FolderPath).read(".docx")

    # Step 2: extract every PDF and append its record to the output file.
    total = len(PdfResumePath)
    Json_file = function.initiateJson("Result.json")
    try:
        for counter, file in enumerate(PdfResumePath, start=1):
            Result_Json = Transformer(file_dir=file, quitD=quitD).generate_Json()
            Json_file.write(Result_Json + "\n")
            function.displayPercent(counter, total, True)
    finally:
        # Close the file even when one resume fails to parse.
        Json_file.close()
    function.displayPercent(total, total, False)
#!/usr/bin/env python
# encoding: utf-8# -*- coding: utf-8 -*-
# @contact: ybsdeyx@foxmail.com
# @software: PyCharm
# @time: 2019/4/25 16:39
# @author: Paulson●Wier
# @file: captcha_qq.py
# @desc:
import numpy as np
import random
import requests
from selenium.webdriver import ActionChains
import time
from selenium import webdriver
from PIL import Image
import os
from selenium.webdriver.support.ui import WebDriverWait
import cv2


class Login(object):
    """
    Slider-captcha demo for the Tencent "waterproof wall" test page, using
    OpenCV template matching (python + selenium + cv2).
    The original author reports a success rate of roughly 90%.
    In a real application, check after login for information that only
    appears on success (for example the user name) and retry otherwise.
    Demo page: https://open.captcha.qq.com/online.html
    """

    def __init__(self):
        # In a real application the account and password would be set here.
        self.url = "https://open.captcha.qq.com/online.html"
        self.driver = webdriver.Chrome(r"C:/Users/E112434/Downloads/chromedriver.exe")

    @staticmethod
    def show(name):
        """Display the image *name* in a window until a key is pressed."""
        cv2.imshow('Show', name)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    @staticmethod
    def webdriverwait_send_keys(dri, element, value):
        """
        Explicit wait, then type into an element.
        :param dri: driver
        :param element: element to type into
        :param value: text to send
        :return: None
        """
        WebDriverWait(dri, 10, 5).until(lambda dr: element).send_keys(value)

    @staticmethod
    def webdriverwait_click(dri, element):
        """
        Explicit wait, then click an element.
        :param dri: driver
        :param element: element to click
        :return: None
        """
        WebDriverWait(dri, 10, 5).until(lambda dr: element).click()

    @staticmethod
    def get_postion(chunk, canves):
        """
        Locate the gap by template matching.
        :param chunk: path of the background image that contains the gap
        :param canves: path of the slider piece image
        :return: (row, column) of the best match; callers use index 1 as the
            horizontal offset
        """
        target = cv2.imread(chunk, 0)
        template = cv2.imread(canves, 0)
        # Both images are round-tripped through JPEG files in the working
        # directory, exactly as before; matching runs on the re-read copies.
        temp = 'temp.jpg'
        targ = 'targ.jpg'
        cv2.imwrite(temp, template)
        cv2.imwrite(targ, target)
        target = cv2.imread(targ)
        target = cv2.cvtColor(target, cv2.COLOR_BGR2GRAY)
        target = abs(255 - target)  # invert the background
        cv2.imwrite(targ, target)
        target = cv2.imread(targ)
        template = cv2.imread(temp)
        result = cv2.matchTemplate(target, template, cv2.TM_CCOEFF_NORMED)
        x, y = np.unravel_index(result.argmax(), result.shape)
        return x, y

    @staticmethod
    def get_track(distance):
        """
        Build the list of per-step horizontal offsets for the drag.
        :param distance: total distance to cover
        :return: list of integer offsets, one per 0.2 s step
        """
        v = 0            # current speed
        t = 0.2          # duration of one step in seconds
        tracks = []      # displacement of each step
        current = 0      # distance covered so far
        mid = distance * 7 / 8  # switch from accelerating to decelerating here
        distance += 10   # overshoot a little, then move back at the end
        while current < distance:
            if current < mid:
                a = random.randint(2, 4)    # accelerate
            else:
                a = -random.randint(3, 5)   # decelerate
            v0 = v
            # Displacement within one step.
            s = v0 * t + 0.5 * a * (t ** 2)
            current += s
            tracks.append(round(s))
            # The speed reached becomes the next step's initial speed.
            v = v0 + a * t
        # Move backwards towards the approximate target position.
        for i in range(4):
            tracks.append(-random.randint(2, 3))
        for i in range(4):
            tracks.append(-random.randint(1, 3))
        return tracks

    @staticmethod
    def urllib_download(imgurl, imgsavepath):
        """
        Download an image.
        :param imgurl: image url
        :param imgsavepath: destination path
        :return: None
        """
        from urllib.request import urlretrieve
        urlretrieve(imgurl, imgsavepath)

    def after_quit(self):
        """
        Close the browser.
        :return: None
        """
        self.driver.quit()

    def login_main(self):
        """Open the demo page, locate the gap and drag the slider to it."""
        driver = self.driver
        driver.maximize_window()
        driver.get(self.url)

        click_keyi_username = driver.find_element_by_xpath("//div[@class='wp-onb-tit']/a[text()='可疑用户']")
        self.webdriverwait_click(driver, click_keyi_username)

        login_button = driver.find_element_by_id('code')
        self.webdriverwait_click(driver, login_button)
        time.sleep(1)

        driver.switch_to.frame(driver.find_element_by_id('tcaptcha_iframe'))  # switch to the slider frame
        time.sleep(0.5)
        bk_block = driver.find_element_by_xpath('//img[@id="slideBg"]')  # background image
        web_image_width = bk_block.size
        web_image_width = web_image_width['width']
        bk_block_x = bk_block.location['x']

        slide_block = driver.find_element_by_xpath('//img[@id="slideBlock"]')  # slider piece
        slide_block_x = slide_block.location['x']

        bk_block = driver.find_element_by_xpath('//img[@id="slideBg"]').get_attribute('src')       # background url
        slide_block = driver.find_element_by_xpath('//img[@id="slideBlock"]').get_attribute('src')  # slider piece url
        slid_ing = driver.find_element_by_xpath('//div[@id="tcaptcha_drag_thumb"]')  # drag handle

        os.makedirs('./image/', exist_ok=True)
        self.urllib_download(bk_block, './image/bkBlock.png')
        self.urllib_download(slide_block, './image/slideBlock.png')
        time.sleep(0.5)
        # Close the image handle once the width has been read.
        with Image.open('./image/bkBlock.png') as img_bkblock:
            real_width = img_bkblock.size[0]
        width_scale = float(real_width) / float(web_image_width)
        position = self.get_postion('./image/bkBlock.png', './image/slideBlock.png')
        real_position = position[1] / width_scale
        real_position = real_position - (slide_block_x - bk_block_x)
        track_list = self.get_track(real_position + 4)

        ActionChains(driver).click_and_hold(on_element=slid_ing).perform()  # press and hold the left button
        time.sleep(0.2)
        # Step two: drag the element.
        for track in track_list:
            ActionChains(driver).move_by_offset(xoffset=track, yoffset=0).perform()
            time.sleep(0.002)
        time.sleep(1)
        # Step three: release the mouse.
        ActionChains(driver).release(on_element=slid_ing).perform()
        time.sleep(1)
        # NOTE(review): printed unconditionally; the page is never checked.
        print('登录成功')
        # self.after_quit() is intentionally not called here.


if __name__ == '__main__':
    phone = "****"
    login = Login()
    login.login_main()
# coding:utf-8
# 这是jieba的测试文件 jieba 用于中文分词
# docx try
from docx import Document
from docx.shared import Inches
import os
import jieba
import re

# Main entry point of the package; creates a docx document and may also be
# given a document path (d:\\2.docx).
document = Document()
# Add a title. Level 0 uses the "Title" style, other levels use
# Heading{level}; see
# https://python-docx.readthedocs.io/en/latest/user/styles-understanding.html
document.add_heading('Document Title', 0)
p = document.add_paragraph('A plain paragraph having some ')  # add a paragraph
p.add_run('bold').bold = True   # append the run "bold" after "some" in paragraph p
p.add_run(' and some ')
p.add_run('italic.').italic = True

document.add_heading('Heading, level 1', level=1)   # add a level-1 heading
document.add_page_break()                           # add a page break
document.save('demo.docx')                          # save the document

pattern = '(.*)专业'
string  = '崔源的专业是 计算机科学专业。'

Surname_List = ['赵','钱','孙','李','周','吴','郑','王','冯','陈','褚','卫','蒋','沈','韩','杨','朱','秦','尤','许','何','吕','施','张','孔','曹','严','金','魏','陶','姜','戚','谢','邹','苏','潘','葛','奚','范','彭','郎','鲁','韦','昌','马','苗','方','俞','任','袁','柳','酆','鲍','史','唐','费','廉','岑','薛','雷','贺','倪','汤','滕','殷','罗','毕','郝','邬','安','常','乐','于','时','傅','皮','齐','康','余','卜','顾','孟','平','黄','穆','萧','尹','姚','邵','汪','祁','毛','狄','米','贝','明','臧','成','戴','宋','茅','庞','熊','纪','舒','屈','项','祝','董','梁','杜','阮','蓝','闵','席','季','麻','贾','路','娄','危','江','童','颜','郭','梅','盛','林','徐','邱','骆','高','夏','蔡','田','樊','胡','凌','霍','虞','万','柯','管','卢','莫','房','丁','宣','邓','郁','单','杭','洪','包','诸','石','崔','吉','钮','龚','程','嵇','邢','裴','陆','翁','芮','靳','松','井','段','富','焦','巴','谷','车','全','郗','池','秋','仲','伊','宁','仇','栾','甘','祖','武','符','刘','景','詹','龙','叶','幸','韶','黎','溥','庄','白']

Surname_Dict = dict(zip(Surname_List, range(len(Surname_List))))
TmpStr = "傻逼秋"
# print (Surname_Dict.get(TmpStr[0]) > 0)

# jieba.lcut splits a string into tokens, e.g.
# ['liucuiyuan3321', '@', 'outlook', '.', 'com']
print(jieba.lcut(string, cut_all = True))

# The pattern has no "{}" placeholder, so the former .format("专业") call
# changed nothing and has been removed.
result = re.findall(r'[。][^。]*[。]', string)
print(result)
s = '^'+'|'.join(Surname_Dict)
print(s)
print((TmpStr[0] in Surname_List))

# coding:utf-8
# 目前还缺乏研究 如果有多个专业应该怎么处理
# Compare using several strategies
# Split the text into blocks first and parse afterwards: better accuracy and speed
import os
import re
from   xml.dom.minidom import parse
import csv
import jieba
import pdfplumber as pb
import sys
import datetime
import pyDataverse as pd
import json
# import provinces

################################################################################################
# PowerBi dataverse
BASE_URL  = "https://globaldisco.crm5.dynamics.com/api/discovery/v2.0/Instances"
API_TOKEN = "https://org61624faf.api.crm5.dynamics.com/api/data/v9.2"

# Input: folder holding the resume files to traverse
FolderPath = r"C:\Alan .AIA\Python\CV_Automation\ResumeRespo"

PdfResumePath = []              # pdf resume files that meet the criteria
DocxResumePath = []             # docx resume files that meet the criteria
filename = []                   # names of the resume files accepted by Reader.read
ids = []                        # resume ids, in order
data = []                       # final output data structure

################################################################################################
# Reference sets and dictionaries

# 200 common surnames picked by hand, used to build the dictionary
Surname_List = ['赵','钱','孙','李','周','吴','郑','王','冯','陈','褚','卫','蒋','沈','韩','杨','朱','秦','尤','许','何','吕','施','张','孔','曹','严','金','魏','陶','姜','戚','谢','邹','苏','潘','葛','奚','范','彭','郎','鲁','韦','昌','马','苗','方','俞','任','袁','柳','酆','鲍','史','唐','费','廉','岑','薛','雷','贺','倪','汤','滕','殷','罗','毕','郝','邬','安','常','乐','于','时','傅','皮','齐','康','余','卜','顾','孟','平','黄','穆','萧','尹','姚','邵','汪','祁','毛','狄','米','贝','明','臧','成','戴','宋','茅','庞','熊','纪','舒','屈','项','祝','董','梁','杜','阮','蓝','闵','席','季','麻','贾','路','娄','危','江','童','颜','郭','梅','盛','林','徐','邱','骆','高','夏','蔡','田','樊','胡','凌','霍','虞','万','柯','管','卢','莫','房','丁','宣','邓','郁','单','杭','洪','包','诸','石','崔','吉','钮','龚','程','嵇','邢','裴','陆','翁','芮','靳','松','井','段','富','焦','巴','谷','车','全','郗','班','秋','仲','伊','宁','仇','栾','甘','祖','武','符','刘','景','詹','龙','叶','幸','韶','黎','溥','庄','白']

Surname_Dict = dict(zip(Surname_List, range(len(Surname_List)))) # dict: {'赵':0,'钱':1,'孙':2,'李':3, ...}
# Majors
Major_List = ['软件工程','计算机软件','计算机硬件','互联网','通信','电信','网络资源','计算机科学与技术']

# Skills
Skillset_List = ['Java', 'C', 'WEB', 'SQL', 'EJB', 'Cpp', 'C#', 'dotnet', 'RPA', 'Python', 'HTML', 'Html', 'CSS', 'JavaScript', 'R', '外语', 'Office', '项目']

# Locations
Location_List = ['成都', '广州']

# Sources
Vendor_List = ['猎聘', '智联', '前程', '领英', '51']

################################################################################################
# 子函数
################################################################################################
# 抽取器 抽取单个文件的信息
class Extractor(object):
    """Rule-based field extractor for a single resume file.

    Only ``.pdf`` files are read (through pdfplumber, imported as ``pb``);
    for any other extension ``fullText`` and ``fullWord`` stay empty.
    ``search()`` is the entry point.  Every ``__search_*`` helper returns
    ``""`` when nothing is found (``__search_Skill`` returns a list).

    The label patterns accept both ASCII and full-width colons.
    NOTE(review): the source of this class had its line structure collapsed;
    where nesting was ambiguous the most coherent reading was chosen.
    """

    def __init__(self, file_dir):
        """:param file_dir: path of the resume file."""
        self.fullWord = []
        self.fullText = ""
        self.file_dir = file_dir
        if os.path.splitext(self.file_dir)[1] == ".pdf":
            # The context manager closes the PDF even when a page fails.
            with pb.open(self.file_dir) as pdf:
                for page in pdf.pages:
                    self.fullWord += page.extract_words()
                    self.fullText += page.extract_text() or ""

    @staticmethod
    def __readUntil(text, length):
        """Placeholder: read a paragraph until a short line; not implemented."""
        return ""

    @staticmethod
    def __capture(pattern, line):
        """Return group 1 of the first match with whitespace removed, or ""."""
        match = re.search(pattern, line)
        return re.sub(r"\s", "", match.group(1)) if match else ""

    def __word_texts(self):
        """Return the text of every extracted word."""
        is_pdf = os.path.splitext(self.file_dir)[1] == ".pdf"
        return [word["text"] if is_pdf else word for word in self.fullWord]

    # Required fields: name, job applied for, major, phone, source, gender

    # 01 Name
    def __search_Name(self):
        """Find the candidate name in the file name, then in the text."""
        # File name first; handle both path separators.
        base_name = re.split(r"[/\\]", self.file_dir)[-1]
        for candidate in re.findall(r"[\u4e00-\u9fa5]{2,3}", base_name):
            if candidate[0] in Surname_List:
                return candidate
        # A known surname followed by one to three Chinese characters.
        surname_led = "[" + "".join(Surname_List) + "]" + r"[\u4e00-\u9fa5]{1,3}"
        for line in self.fullText.split("\n"):
            if re.search(r"姓[ ]+名", line):
                # The first labelled line decides; a value capture keeps the
                # characters 姓/名 that may occur inside the name itself.
                labelled = self.__capture(r"姓[ ]+名[ ::]+([\u4e00-\u9fa5]{2,4})", line)
                if labelled and labelled[0] in Surname_List:
                    return labelled
                return ""
            found = re.findall(surname_led, line)
            if found:
                return found[0]
        return ""

    # 02 Job applied for
    def __search_Jobs(self):
        """Return the value following a job-title label."""
        titles = ["期望职位", "应聘职位", "期望从事职位"]
        for line in self.fullText.split("\n"):
            for title in titles:
                if title not in line:
                    continue
                job = self.__capture(
                    title + r"[::\s]*([a-z|A-Z|0-9|\u4e00-\u9fa5]{2,14})", line)
                if job:
                    return job
                break  # only the first label found on a line is tried
        return ""

    # 03 Major
    def __search_Major(self):
        """Return the major from labelled lines, known majors or word scan."""
        majors = []
        for line in self.fullText.split("\n"):
            # "专 业" label (51job layout). NOTE(review): the trigger needs a
            # space after 专 and treats 业 as optional; kept as written.
            if re.search(r"专[ ]+业*", line):
                value = self.__capture(r"专[ ]+业[::\s]*([\u4e00-\u9fa5]{2,10})", line)
                if value:
                    majors.append(value)
            # "行 业" label (Liepin layout)
            if re.search(r"\s*行[ ]+业*", line):
                value = self.__capture(r"行[ ]+业[::\s]*([\u4e00-\u9fa5]{2,10})", line)
                if value:
                    majors.append(value)
            for known in Major_List:
                if known in line:
                    return known
        if majors:
            return majors[0]
        # Fall back to single words that end in a major-like suffix.
        for text in self.__word_texts():
            # The old test `"专业" or "系" or "技术" in text` was always true,
            # which made the English branch unreachable.
            if any(mark in text for mark in ("专业", "系", "技术")):
                found = re.findall(r"[\u4e00-\u9fa5]{2,10}?(?:专业|系|技术)", text)
            elif any(mark in text for mark in ("Bsc", "Major", "Msc")):
                # Crude heuristic kept from the original; `[a-Z]` was an
                # invalid range and is now `[a-zA-Z]`.
                found = re.findall(r"[a-zA-Z]{2,5}?(?:Bsc|Msc|Major)", text)
            else:
                found = []
            if found:
                return found[0]
        return ""

    # 04 Phone
    def __search_Phone(self):
        """Return a phone number of 11 to 15 digits."""
        for line in self.fullText.split("\n"):
            if re.search(r"电\s*话", line) or re.search(r"手\s*机", line):
                cleaned = re.sub(r"[()()::+\-]", "", line)
                numbers = re.findall(r"\d{11,15}", cleaned)
                if numbers:
                    return numbers[0]
        # No labelled number: return every candidate, de-duplicated in order
        # of appearance (a set made the order unpredictable).
        text = re.sub(r"[()()+\-]", "", self.fullText)
        phones = re.findall(r"\d{11,15}", text)
        return ",".join(dict.fromkeys(phones))

    # 06 Source
    def __search_Vendor(self):
        """Return the first vendor name found in the path or first 21 lines."""
        for vendor in Vendor_List:
            if vendor in self.file_dir:
                return vendor
        for line in self.fullText.split("\n")[:21]:
            for vendor in Vendor_List:
                if vendor in line:
                    return vendor
        return ""

    # 07 Gender
    def __search_Gender(self):
        """Return "男" or "女" from the first 16 lines when present."""
        gender = ""
        for line in self.fullText.split("\n")[:16]:
            if re.search(r"性[ ]+别*", line):
                # Guarded: the unguarded `findall(...)[0]` raised IndexError
                # for one-character values such as "男".
                gender = self.__capture(
                    r"性[ ]+别[::\s]*([\u4e00-\u9fa5]{2,10})", line) or gender
            if re.search(r"男", line) or re.search(r"Male", line):
                return "男"
            if re.search(r"女", line) or re.search(r"Female", line):
                return "女"
        return gender

    # Optional fields

    # 08 Age
    def __search_Age(self):
        """Return the age as a string, from a birth year or an "岁" value."""
        current_year = datetime.datetime.now().year
        for line in self.fullText.split("\n"):
            if re.search(r"出生年月", line):
                years = re.findall(r"\d{4,4}", line)
                if years:
                    return str(current_year - int(years[0]))
            if "岁" in line:
                # Prefer the digits directly before 岁 over the first digits
                # on the line (which could be part of a year).
                adjacent = re.search(r"(?<!\d)(\d{1,2})\s*岁", line)
                if adjacent:
                    return adjacent.group(1)
                digits = re.findall(r"\d{1,2}", line)
                if digits:
                    return digits[0]
        return ""

    # 09 Employment status
    def __search_Condition(self):
        """Return the employment status found in the first 21 lines."""
        for line in self.fullText.split("\n")[:21]:
            if re.search(r"离职", line):
                return "离职"
            if re.search(r"正在找工作", line):
                return "正在找工作"
            if re.search(r"在职", line):
                return "在职"
        return ""

    # 10 City
    def __search_City(self):
        """Return the value following a location label."""
        for line in self.fullText.split("\n"):
            if "地点" in line:
                city = self.__capture(r"地点[::\s]*([\u4e00-\u9fa5]{2,8})", line)
                if city:
                    return city
            if re.search(r"所在地|现居地", line):
                city = self.__capture(
                    r"(?:所在地|现居地)[::\s]*([\u4e00-\u9fa5\s]{2,8})", line)
                if city:
                    return city
            if re.search(r"住\s*址|现居住|Location", line):
                for pattern in (r"住\s*址[::\s]*([\u4e00-\u9fa5\s]{2,8})",
                                r"现居住[::\s]*([\u4e00-\u9fa5\s]{2,8})",
                                r"Location[::\s]*([\u4e00-\u9fa5\s]{2,8})"):
                    city = self.__capture(pattern, line)
                    if city:
                        return city
        return ""

    # 11 Education level
    def __search_Stage(self):
        """Return the education level named on the first matching line."""
        setPhd = ["博士"]
        setMsc = ["硕士", "研究生"]
        setBsc = ["大学", "本科"]
        setByd = ["大专", "专科"]
        setOth = ["学院"]
        setSta = setPhd + setMsc + setBsc + setByd + setOth
        for line in self.fullText.split("\n"):
            if not any(term in line for term in setSta):
                continue
            # Highest degree wins; previously "大学" in a university name
            # overwrote "硕士"/"博士" found on the same line.
            if any(term in line for term in setPhd):
                return "博士"
            if any(term in line for term in setMsc):
                return "硕士"
            if any(term in line for term in setBsc):
                return "本科"
            return "专科"
        return ""

    # 12 Hometown
    def __search_Hometown(self):
        """Return the value following the "籍 贯" label."""
        # The old line counter was never incremented, so all lines were and
        # still are scanned.
        for line in self.fullText.split("\n"):
            if re.search(r"籍[ ]+贯*", line):
                hometown = self.__capture(
                    r"籍[ ]+贯[::\s]*([\u4e00-\u9fa5]{2,10})", line)
                if hometown:
                    return hometown
        return ""

    # 13 Self evaluation
    def __search_SelfComment(self):
        """Not implemented yet; always returns ""."""
        # TODO: collect the lines following a "自我评价" heading.
        return ""

    # 14 Working experience
    def __search_WorkExperience(self):
        """Not implemented yet; always returns ""."""
        return ""

    # 15 Education experience
    def __search_EducationExperience(self):
        """Not implemented yet; always returns ""."""
        return ""

    # 16 School
    def __search_School(self):
        """Return the school named on the first line that mentions one."""
        # Order matters: university, campus, college.
        college_signs = ["大学", "校区", "学院"]
        note_signs = ["毕业院校"]
        candidates = []
        for line in self.fullText.split("\n"):
            for term in note_signs:
                if term in line:
                    candidates += re.findall(r"[::\s]*[\u4e00-\u9fa5]{2,10}", line)
            for term in college_signs:
                if term in line:
                    # Turn punctuation into spaces so \S stops at it.
                    line = re.sub("[::|-]", " ", line)
                    candidates += re.findall(r"\S{2,10}" + term, line)
            if candidates:
                # A college usually follows its university, so take the last.
                return re.sub(r"[::\s]", "", candidates[-1])
        return ""

    # 17 Certificates
    def __search_Certificate(self):
        """Not implemented yet; always returns ""."""
        return ""

    # 18 Professional skills
    def __search_ProfessionalSkills(self):
        """Not implemented yet; always returns ""."""
        return ""

    # 19 Expected salary
    def __search_Salary(self):
        """Return the expected salary; "万" amounts are converted to digits."""
        def to_number(match):
            # "1.5万" -> "15000"; a plain "0000" suffix is wrong for decimals.
            return str(int(round(float(match.group(1)) * 10000)))

        salary_list = []
        for line in self.fullText.split("\n"):
            if "期望薪资" in line:
                # Previously appended to the undefined name `school_list`.
                salary_list += re.findall(r"[::\s]*\S{2,10}", line)
            # A monthly range: two amounts around "-".
            if re.search("/月", line):
                salary_list += re.findall(r"[0-9\.\s 万]{1,10}-[0-9\.\s 万]{1,10}", line)
            if salary_list:
                salary = re.sub(r"[\s ]", "", salary_list[-1]).lstrip("::")
                return re.sub(r"(\d+(?:\.\d+)?)万", to_number, salary)
        return ""

    # 20 Years of experience
    def __search_WorkYears(self):
        """Not implemented yet; always returns ""."""
        return ""

    # 22 Email
    def __search_Email(self):
        """Return the first token containing "@" on a labelled line or word."""
        token = r"[a-zA-Z0-9_\-.@]+"
        for line in self.fullText.split("\n"):
            if re.search(r"邮[ ]+箱", line):
                for candidate in re.findall(token, line):
                    if "@" in candidate:
                        return candidate
        for text in self.__word_texts():
            if "@" in text and "." in text:
                for candidate in re.findall(token, text):
                    if "@" in candidate:
                        return candidate
        return ""

    # Skills
    def __search_Skill(self):
        """Return every line that mentions a keyword from Skillset_List."""
        return [
            line for line in self.fullText.split("\n")
            if any(re.search(keyword, line) for keyword in Skillset_List)
        ]

    # Entry point
    def search(self):
        """Run every field search and return the results as a dict.

        The directory is the second-to-last path component (``""`` when the
        path has none) and the file name the last one.  A failing field is
        reported on stdout and left as ``""`` so the other fields survive.
        """
        sep_dir = re.split(r"/+|\\+", self.file_dir)
        directory = sep_dir[-2] if len(sep_dir) > 1 else ""
        file_name = sep_dir[-1]
        info = {"Directory": directory, "file_name": file_name, "user_name": "", "email": "", "phone": "", "gender": "", "stage": "", "major": "", "age": "", "city": "", "skill": "", "jobs": "", "vendor": "", "condition": "", "hometown": "", "school": "", "salary": "", "selfComment": ""}
        # Callables, so that each search runs inside the try block below.
        searchers = {
            "user_name":    self.__search_Name,           # name
            "jobs":         self.__search_Jobs,           # job applied for
            "major":        self.__search_Major,          # major
            "phone":        self.__search_Phone,          # phone
            "vendor":       self.__search_Vendor,         # source
            "gender":       self.__search_Gender,         # gender
            "age":          self.__search_Age,            # age
            "condition":    self.__search_Condition,      # employment status
            "city":         self.__search_City,           # current city
            "stage":        self.__search_Stage,          # education level
            "hometown":     self.__search_Hometown,       # hometown
            "selfComment":  self.__search_SelfComment,    # self evaluation
            "school":       self.__search_School,         # school
            "salary":       self.__search_Salary,         # expected salary
            "email":        self.__search_Email,          # email
            "skill":        self.__search_Skill,          # skills
        }
        for key, searcher in searchers.items():
            try:
                info[key] = searcher()
            except Exception as e:
                print(e)
        return info
################################################################################################
# 猎聘
# class Lie-Pin (object):################################################################################################
# 智联
# class Zhi-Lian (object):################################################################################################
# 前程无忧
# class Qian-Cheng (object):################################################################################################
# 51jobs
# class Jobs (object):################################################################################################
# 遍历并读取函数
class Reader(object):
    """List the resume files of one type inside a folder."""

    def __init__(self, folder_Path):
        """:param folder_Path: folder whose direct children are scanned."""
        self.path = folder_Path

    def read(self, type):
        """Return the paths of the regular files whose name contains *type*.

        Files whose name contains ``"$"`` are skipped.  Every accepted file
        name is also appended to the module-level ``filename`` list.

        :param type: substring identifying the file type, e.g. ``".pdf"``.
        :return: list of paths built from the folder given to ``__init__``.
        """
        ResumePath = []
        for file in os.listdir(self.path):
            # Join with this reader's folder; the old code ignored
            # ``self.path`` and always used the global ``FolderPath``.
            filepath = os.path.join(self.path, file)
            if not os.path.isfile(filepath):
                continue
            # Test the file name only, so the folder path cannot influence it.
            if type in file and "$" not in file:
                ResumePath.append(filepath)
                filename.append(file)
        return ResumePath
################################################################################################
# 输出生成函数
class Generator(object):
    """Print or serialise one extracted resume record."""

    def __init__(self, sourceInfo):
        """:param sourceInfo: dict of extracted fields for one candidate."""
        self.info = sourceInfo

    def display(self, index=None):
        """Print the record to stdout.

        :param index: candidate number shown in the banner.  When omitted,
            the module-level ``counter`` is used if it exists, otherwise "?"
            (the old code raised NameError without that global).
        """
        result = self.info
        if index is None:
            index = globals().get("counter", "?")
        print("################### Candidate ", index, " ###################")
        # Necessary info
        print("Name     : ", result["user_name"])
        print("Position :", result["jobs"])
        print("Major    : ", result["major"])
        print("Phone    : ", result["phone"])
        print("Gender   : ", result["gender"])
        print("Source   : ", result["file_name"])
        print("Vendor   : ", result["vendor"])
        print("Condition: ", result["condition"])
        # Optional info
        print("Email    : ", result["email"])
        print("City     : ", result["city"])
        print("Age      : ", result["age"])
        print("Stage    : ", result["stage"])
        print("Hometown : ", result["hometown"])
        print("School   : ", result["school"])
        print("Salary   : ", result["salary"])
        print("\n\n\n")

    def generate_Json(self):
        """Serialise ``self.info`` to an indented JSON string.

        :raises TypeError: if the record holds a non-serialisable value.  The
            previous version printed the error and then failed with an
            unrelated ``UnboundLocalError``.
        """
        return json.dumps(
            self.info,
            sort_keys=True,
            indent=4,
            separators=(',', ':'),
            ensure_ascii=False,
        )
################################################################################################
# Json形式下的简历信息发布至 dataverse (Power BI)
class dataverse_Publish(object):
    """Publish resume data to a Dataverse instance through pyDataverse."""

    def __init__(self, sourceJson):
        """:param sourceJson: stored as ``self.source``.

        NOTE(review): ``process`` does not read ``self.source``; it always
        loads "TestJson.json". Confirm whether that is intended.
        """
        self.source = sourceJson

    def process(self):
        """Create, publish and fetch the "Dataverse_Resumes" collection.

        Uses the module-level ``BASE_URL`` and ``API_TOKEN``.  The API
        responses are not inspected and nothing is returned.
        """
        sourceFile = "TestJson.json"
        # Connect to the API
        from pyDataverse.api import NativeApi
        api = NativeApi(BASE_URL, API_TOKEN)
        # Create the collection from the JSON file
        from pyDataverse.models import Dataverse
        from pyDataverse.utils import read_file
        dv = Dataverse()
        dv.from_json(read_file(sourceFile))
        resp = api.create_dataverse(":root", dv.json())
        resp = api.publish_dataverse("Dataverse_Resumes")
        resp = api.get_dataverse("Dataverse_Resumes")
################################################################################################
# 杂项函数
class function:
    """Namespace of stateless helpers; every helper is called on the class."""

    @staticmethod
    def displayPercent(counter, total, turn):
        """Write a progress indicator to stdout.

        :param counter: number of items processed so far.
        :param total: number of items overall.
        :param turn: ``True`` prints the running percentage followed by a
            carriage return; ``False`` prints the final "100%" line.
        :return: always ``""``.
        """
        assert (isinstance(counter, int) and isinstance(total, int) and isinstance(turn, bool))
        if turn:
            # An empty job is reported as complete instead of dividing by zero.
            percent = float(counter) * 100 / float(total) if total else 100.0
            sys.stdout.write("%.4f" % percent)
            sys.stdout.write("%\r")
        else:
            sys.stdout.write("100%!finish!\n")
        sys.stdout.flush()
        return ""

    @staticmethod
    def initiateJson(filename):
        """Open *filename* for writing as an empty UTF-8 file and return it.

        The caller owns the returned file object and must close it.
        """
        assert (isinstance(filename, str) and (".json" in filename))
        # Mode "w" already truncates the file and starts at offset 0.
        return open(filename, 'w', encoding='utf-8')
################################################################################################
# 主函数
if __name__ == "__main__":
    # Step 1: collect the resume files in the folder.
    PdfResumePath  = Reader(folder_Path = FolderPath).read(".pdf")
    DocxResumePath = Reader(folder_Path = FolderPath).read(".docx")
    ResumeInfoList = []

    # Step 2: extract every PDF resume.
    total = len(PdfResumePath)
    counter = 0
    for counter, file in enumerate(PdfResumePath, start=1):
        ResumeInfoList.append(Extractor(file_dir = file).search())
        function.displayPercent(counter, total, True)
    function.displayPercent(counter, total, False)

    # Display each record and write all of them as one JSON array.  The file
    # is opened only now, so a failed extraction no longer leaves a truncated
    # result behind, and the `with` block closes it on error.
    length = len(ResumeInfoList)
    with function.initiateJson("resume_Result.json") as Json_file:
        Json_file.write("[\n")
        # `counter` stays a module-level name: Generator.display reads it.
        for counter, info in enumerate(ResumeInfoList, start=1):
            generator = Generator(sourceInfo = info)
            generator.display()
            Json_file.write(generator.generate_Json())
            if counter != length:
                Json_file.write(",")
            Json_file.write("\n")
        Json_file.write("]")

    # Copy to the repository
    # Step 3: export to dataverse
    # dataverse_Publish(sourceJson = Json_filename).process()
    # https://orgd9c1d674.api.crm5.dynamics.com/api/data/v9.2
# https://org61624faf.api.crm5.dynamics.com/api/data/v9.2################################################################################################
# 函数 读取信息
# print (ResumePath[0])
# xingming_node = document_tree.getElementsByTagName("XingMing")[0]
# xingming = xingming_node.childNodes[0].data################################################################################################
# 函数 将一份简历信息写入 Excel 文件
# print (ResumePath)
# print (filename)# coding:utf-8
# 这个版本是用于简历分栏
# New approach: walk over every text line and check whether it is only about
# four characters long and belongs to the set of section separators; if so,
# split the resume there.
# Table extraction seems to work only for one of the sample resumes (Deng's).
import os
import re
from   xml.dom.minidom import parse
import csv
import jieba
import pdfplumber as pb
import sys
import datetime
import pyDataverse as pd
import json

from collections import OrderedDict

# Input: folder holding the resume files to traverse
FolderPath = r"C:\Alan .AIA\Python\ResumeRespo"

PdfResumePath = []              # pdf resume files that meet the criteria
DocxResumePath = []             # docx resume files that meet the criteria
filename = []                   # names of the resume files accepted by Reader.read
ids = []                        # resume ids, in order
data = []                       # final output data structure
################################################################################################
# 抽取器 抽取单个文件的信息
class Extractor (object):
    """Read one resume file and split its text into paragraphs.

    For a ``.pdf`` path the words and text of every page are loaded with
    pdfplumber; for any other extension the instance is created with empty
    text. ``search()`` is the entry point.
    """

    def __init__ (self, file_dir):
        """Load the words and text of ``file_dir`` when it is a PDF."""
        self.fullWord = []
        self.fullText = ""
        self.file_dir = file_dir
        self.ansList = []
        if os.path.splitext(self.file_dir)[1] == ".pdf":
            # The context manager closes the PDF even if a page fails.
            with pb.open(self.file_dir) as pdf:
                for page in pdf.pages:
                    self.fullWord += page.extract_words()
                    # extract_text() may return nothing for an empty page.
                    self.fullText += page.extract_text() or ""

    def slide (self):
        """Split ``self.fullText`` into paragraphs at short heading lines.

        A line shorter than 5 characters (after ``function.quitDuplicate``)
        ends the text run collected so far. If it also contains one of the
        heading keywords, the current paragraph is closed and a new one is
        started with that line as its first element.

        Returns a list of paragraphs, each a list of strings. The first
        element of the result is whatever preceded the first heading and may
        be an empty list.

        NOTE(review): the nesting of this loop was reconstructed from a
        flattened source; short lines without a keyword are treated as
        separators and are not kept. Confirm against the original file.
        """
        pattern_list = ["信息", "评价", "经历", "经验", "信息", "技能", "意向"]
        paragraph_list = []
        paragraph = []
        currentText = ""
        for raw_line in self.fullText.split('\n'):
            # Collapse doubled Chinese characters before measuring the line.
            line = function.quitDuplicate(raw_line)
            if len(line) < 5:
                # Flush the text collected before this short line.
                if currentText:
                    paragraph.append(currentText)
                if any(keyword in line for keyword in pattern_list):
                    paragraph_list.append(paragraph)
                    paragraph = [line]
                currentText = ""
                continue
            currentText += line + "\n"
        # Append the trailing paragraph.
        paragraph.append(currentText)
        paragraph_list.append(paragraph)
        return paragraph_list

    def search (self):
        """Return ``{"Directory", "file_name", "para"}`` for this resume.

        The path is split on ``/`` or ``\\``: the last part is the file name
        and the one before it the directory ("" when the path has no
        directory part). ``"para"`` holds the result of ``slide()``.
        """
        sep_dir = re.split(r"/+|\\+", self.file_dir)
        directory = sep_dir[-2] if len(sep_dir) > 1 else ""
        file_name = sep_dir[-1]
        # Keep the paragraphs: the result of slide() used to be thrown away,
        # which left "para" permanently empty.
        self.ansList = self.slide()
        info = {"Directory": directory, "file_name": file_name, "para": self.ansList}
        return info
################################################################################################
# 遍历并读取函数
class Reader (object):
    """List the resume files of one type inside a folder."""

    def __init__ (self, folder_Path):
        """Remember the folder to scan."""
        self.path = folder_Path

    def read (self, type):
        """Return the paths of the files in the folder whose name contains ``type``.

        ``type`` is a string marking the file type, e.g. ".pdf". Entries that
        are not regular files, and files whose name contains "$", are
        skipped. Every accepted file name is also appended to the
        module-level ``filename`` list.
        """
        ResumePath = []
        for file in os.listdir(self.path):
            # Build the path from the folder this reader was created with,
            # not from the global FolderPath.
            filepath = os.path.join(self.path, file)
            if not os.path.isfile(filepath):
                continue
            # Test the file name only, so that characters in the folder path
            # cannot make every file match or be rejected.
            if type in file and "$" not in file:
                ResumePath.append(filepath)
                filename.append(file)
        return ResumePath
################################################################################################
# 输出生成函数
class Generator (object):
    """Print the paragraphs extracted from one resume."""

    def __init__ (self, sourceInfo):
        """Store the dict to display; it must have a "para" entry."""
        self.info = sourceInfo

    def display (self, index = None):
        """Print the paragraph count and the paragraphs of this resume.

        ``index`` is the candidate number shown in the banner. When omitted,
        the module-level ``counter`` is used if it exists, which is how the
        script in this file calls it.
        """
        result = self.info
        if index is None:
            index = globals().get("counter", "")
        # Read from the instance's own record rather than a global `info`.
        print("Length: " + str(len(result["para"])))
        print("################### Candidate ", index, " ###################")
        # print paragraphs
        print(result["para"])
        # print("SkillSet : ", "\n".join(result["skill"]))
        print("\n\n\n")
################################################################################################
# 杂项函数
class function:
    """Miscellaneous helpers used by the resume scripts."""

    @staticmethod
    def displayPercent (counter, total, turn):
        """Write a progress indicator to stdout and return "".

        With ``turn`` True the percentage ``counter / total`` is written
        followed by a carriage return; with ``turn`` False the final
        "100%!finish!" line is written. A ``total`` of 0 is reported as 0%
        instead of raising ZeroDivisionError.
        """
        assert (isinstance(counter, int) and isinstance(total, int) and isinstance(turn, bool))
        if turn:
            # Regular progress line.
            percent = float(counter) * 100 / float(total) if total else 0.0
            sys.stdout.write("%.4f" % percent)
            sys.stdout.write("%\r")
            sys.stdout.flush()
        else:
            # Final line.
            sys.stdout.write("100%!finish!\n")
            sys.stdout.flush()
        return ""

    @staticmethod
    def quitDuplicate (source):
        """Remove the second of two adjacent identical Chinese characters.

        Characters in the range U+4E00..U+9FFF that repeat the character
        just before them are dropped; the character following a dropped one
        is kept without being compared, so "谢谢谢谢" becomes "谢谢". Other
        characters are never removed. The result is built in one pass
        instead of re-slicing the string on every removal.
        """
        if not source:
            return source
        kept = [source[0]]
        pos = 1
        size = len(source)
        while pos < size:
            char = source[pos]
            if char == kept[-1] and '\u4e00' <= char <= '\u9fff':
                # Drop the duplicate and accept the next character as-is.
                pos += 1
                if pos < size:
                    kept.append(source[pos])
            else:
                kept.append(char)
            pos += 1
        return "".join(kept)
################################################################################################
# 主函数
if __name__ == "__main__":
    # Step 1: list every resume file in the configured folder.
    PdfResumePath = Reader(folder_Path=FolderPath).read(".pdf")
    DocxResumePath = Reader(folder_Path=FolderPath).read(".docx")

    # Step 2: extract every PDF resume while reporting progress.
    total = len(PdfResumePath)
    ResumeInfoList = []
    counter = 0
    for counter, file in enumerate(PdfResumePath, start=1):
        # if (counter > 1): continue
        ResumeInfoList.append(Extractor(file_dir=file).search())
        function.displayPercent(counter, total, True)
    function.displayPercent(counter, total, False)

    # Display every record. `counter` and `info` are kept as module-level
    # names because Generator.display reads them.
    counter = 0
    for counter, info in enumerate(ResumeInfoList, start=1):
        Generator(sourceInfo=info).display()
# coding:utf-8
# 版本04 试验对简历信息的分栏处理import os
import re
from xml.dom.minidom import parse
import csv
import jieba
import pdfplumber as pb
import sys
import datetime
import pyDataverse as pd
import json
# import provinces################################################################################################
# PowerBi dataverse
BASE_URL = "https://globaldisco.crm5.dynamics.com/api/discovery/v2.0/Instances"
# NOTE(review): this holds an API endpoint URL, not a token; confirm what
# NativeApi is expected to receive here.
API_TOKEN = "https://org61624faf.api.crm5.dynamics.com/api/data/v9.2"

# Input: folder holding the resume files; every resume in it is traversed.
FolderPath = r"C:\Alan .AIA\Python\ResumeRespo"

# Module-level accumulators shared by the classes below.
PdfResumePath = []      # pdf resume files that matched
DocxResumePath = []     # docx resume files that matched
filename = []           # names of the resume files found
ids = []                # resume ids, in order
data = []               # final output data structure
################################################################################################
# Reference sets / dictionaries

# 200 hand-picked common surnames, one character each.
Surname_List = list(
    "赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许"
    "何吕施张孔曹严金魏陶姜戚谢邹苏潘葛奚范彭"
    "郎鲁韦昌马苗方俞任袁柳酆鲍史唐费廉岑薛雷"
    "贺倪汤滕殷罗毕郝邬安常乐于时傅皮齐康余卜"
    "顾孟平黄穆萧尹姚邵汪祁毛狄米贝明臧成戴宋"
    "茅庞熊纪舒屈项祝董梁杜阮蓝闵席季麻贾路娄"
    "危江童颜郭梅盛林徐邱骆高夏蔡田樊胡凌霍虞"
    "万柯管卢莫房丁宣邓郁单杭洪包诸石崔吉钮龚"
    "程嵇邢裴陆翁芮靳松井段富焦巴谷车全郗班秋"
    "仲伊宁仇栾甘祖武符刘景詹龙叶幸韶黎溥庄白"
)
# Surname -> position in Surname_List: {'赵': 0, '钱': 1, '孙': 2, '李': 3, ...}
Surname_Dict = {surname: position for position, surname in enumerate(Surname_List)}

# Majors
Major_List = [
    '软件工程', '计算机软件', '计算机硬件', '互联网',
    '通信', '电信', '网络资源', '计算机科学与技术',
]

# Skills
Skillset_List = [
    'Java', 'C', 'WEB', 'SQL', 'EJB', 'Cpp', 'C#', 'dotnet', 'RPA',
    'Python', 'HTML', 'Html', 'CSS', 'JavaScript', 'R', '外语', 'Office', '项目',
]

# Locations
Location_List = ['成都', '广州']

# Resume sources
Vendor_List = ['猎聘', '智联', '前程', '领英', '51']
################################################################################################
# 子函数
################################################################################################
# 抽取器 抽取单个文件的信息
class Extractor (object):
    """Extract candidate fields (name, phone, email, ...) from one resume.

    For a ``.pdf`` path the words and text of every page are loaded with
    pdfplumber; for any other extension the instance starts with empty
    text. ``search()`` is the entry point; the ``__search_*`` helpers each
    look for one field in ``self.fullText`` / ``self.fullWord`` and return
    "" (or an empty list for skills) when nothing is found.

    Label patterns accept both ASCII and full-width colons/parentheses, and
    values are taken from capture groups so that characters of the label
    are never deleted from inside the value.

    NOTE(review): this class was reconstructed from a flattened source, so
    the exact nesting of some early exits had to be inferred.
    """

    def __init__ (self, file_dir):
        """Load the words and text of ``file_dir`` when it is a PDF."""
        self.fullWord = []
        self.fullText = ""
        self.file_dir = file_dir
        if os.path.splitext(self.file_dir)[1] == ".pdf":
            # The context manager closes the PDF even if a page fails.
            with pb.open(self.file_dir) as pdf:
                for page in pdf.pages:
                    self.fullWord += page.extract_words()
                    self.fullText += page.extract_text() or ""

    def __word_text (self, word):
        """Return the text of one ``fullWord`` entry (a dict for PDF input)."""
        if os.path.splitext(self.file_dir)[1] == ".pdf":
            return word["text"]
        return word

    # Required fields: name, applied position, major, phone, source, gender

    # 01 Name
    def __search_Name (self):
        """Return the candidate name, or "" when none is recognised.

        Looks first for a 2-3 character Chinese run in the file name that
        starts with a known surname, then line by line for a "姓 名" label,
        and otherwise for the first surname-led Chinese word on a line.
        """
        base_name = re.split(r"[/\\]", self.file_dir)[-1]
        for candidate in re.findall(r"[\u4e00-\u9fa5]{2,3}", base_name):
            if candidate[0] in Surname_List:
                return candidate

        # One character class of all surnames (no "|" inside the brackets,
        # which would match a literal pipe).
        surname_word = re.compile("[" + "".join(Surname_List) + "]" + r"[\u4e00-\u9fa5]{1,3}")
        labelled_name = re.compile(r"姓[ ]+名[\s:：]+([\u4e00-\u9fa5]{2,4})")
        for line in self.fullText.split("\n"):
            if re.search(r"姓[ ]+名", line):
                found = labelled_name.search(line)
                if found is None:
                    # Label without a value on this line: keep looking.
                    continue
                name = found.group(1)
                return name if name[0] in Surname_List else ""
            hits = surname_word.findall(line)
            if hits:
                return hits[0]
        return ""

    # 02 Applied position
    def __search_Jobs (self):
        """Return the text following a desired/applied-position label, or ""."""
        JobTitle_List = ["期望职位", "应聘职位", "期望从事职位"]
        for line in self.fullText.split("\n"):
            for title in JobTitle_List:
                found = re.search(title + r"[:：\s]*([a-z|A-Z|0-9|\u4e00-\u9fa5]{2,14})", line)
                if found:
                    return found.group(1)
        return ""

    # 03 Major
    def __search_Major (self):
        """Return the major, or "" when none is recognised.

        Order of preference: the first entry of ``Major_List`` found on any
        line, then the first value following a "专 业" / "行 业" label, then
        the first word that ends in 专业/系/技术 or in Bsc/Msc/Major.
        """
        labelled = []
        for line in self.fullText.split("\n"):
            for label in (r"专[ ]+业", r"行[ ]+业"):
                found = re.search(label + r"[:：\s]*([\u4e00-\u9fa5]{2,10})", line)
                if found:
                    labelled.append(found.group(1))
            for premajor in Major_List:
                if premajor in line:
                    return premajor
        if labelled:
            return labelled[0]

        # Fall back to single words from the body of the resume.
        for word in self.fullWord:
            text = self.__word_text(word)
            if any(key in text for key in ("专业", "系", "技术")):
                hits = re.findall(r"[\u4e00-\u9fa5]{2,10}?(?:专业|系|技术)", text)
            elif any(key in text for key in ("Bsc", "Major", "Msc")):
                # English degrees; the original comment says this part still
                # needs work.
                hits = re.findall(r"[a-zA-Z]{2,5}?(?:Bsc|Msc|Major)", text)
            else:
                hits = []
            if hits:
                return hits[0]
        return ""

    # 04 Phone
    def __search_Phone (self):
        """Return a phone number made of 11 to 15 digits, or "".

        The first such number on a line containing "电话" or "手机" wins.
        Otherwise every distinct 11-15 digit run of the whole text is
        returned, comma-separated, in order of first appearance.
        """
        for line in self.fullText.split("\n"):
            if re.search(r"电\s*话", line) or re.search(r"手\s*机", line):
                # Strip punctuation that may split the digits.
                digits = re.findall(r"\d{11,15}", re.sub(r"[()（）:：+\-]", "", line))
                if digits:
                    return digits[0]
        cleaned = re.sub(r"[()（）+\-]", "", self.fullText)
        # dict.fromkeys de-duplicates while keeping a stable order.
        return ",".join(dict.fromkeys(re.findall(r"\d{11,15}", cleaned)))

    # 06 Source
    def __search_Vendor (self):
        """Return the first ``Vendor_List`` entry found in the path or the first 21 lines."""
        for vendor in Vendor_List:
            if vendor in self.file_dir:
                return vendor
        for line in self.fullText.split("\n")[:21]:
            for vendor in Vendor_List:
                if vendor in line:
                    return vendor
        return ""

    # 07 Gender
    def __search_Gender (self):
        """Return "男" or "女" from the first 16 lines, or "" if neither appears."""
        for line in self.fullText.split("\n")[:16]:
            labelled = re.search(r"性\s*别[:：\s]*([男女])", line)
            if labelled:
                return labelled.group(1)
            if "男" in line or "Male" in line:
                return "男"
            if "女" in line or "Female" in line:
                return "女"
        return ""

    # Optional fields

    # 08 Age
    def __search_Age (self):
        """Return the age as a string, or "".

        A line containing "出生年月" yields the current year minus the first
        4-digit number on it. A line containing "岁" yields the 1-2 digit
        number directly before "岁"/"周岁", or failing that the first 1-2
        digits on the line.
        """
        current_year = datetime.datetime.now().year
        for line in self.fullText.split("\n"):
            if "出生年月" in line:
                year = re.search(r"\d{4}", line)
                if year:
                    return str(current_year - int(year.group()))
            if "岁" in line:
                aged = re.search(r"(\d{1,2})\s*周?岁", line) or re.search(r"(\d{1,2})", line)
                if aged:
                    return aged.group(1)
        return ""

    # 09 Employment status
    def __search_Cond (self):
        """Return 离职 / 正在找工作 / 在职 from the first 21 lines, or ""."""
        for line in self.fullText.split("\n")[:21]:
            for status in ("离职", "正在找工作", "在职"):
                if status in line:
                    return status
        return ""

    # 10 City
    def __search_City (self):
        """Return the value following the first location label found, or ""."""
        patterns = (
            r"地点[:：\s]*([\u4e00-\u9fa5]{2,8})",
            r"(?:所在地|现居地)[:：\s]*([\u4e00-\u9fa5\s]{2,8})",
            r"住\s*址[:：\s]*([\u4e00-\u9fa5\s]{2,8})",
            r"现居住[:：\s]*([\u4e00-\u9fa5\s]{2,8})",
            r"Location[:：\s]*([\u4e00-\u9fa5\s]{2,8})",
        )
        for line in self.fullText.split("\n"):
            for pattern in patterns:
                found = re.search(pattern, line)
                if found:
                    city = re.sub(r"\s", "", found.group(1))
                    if city:
                        return city
        return ""

    # 11 Education level
    def __search_Stage (self):
        """Return the degree named on the first line that mentions one.

        Within that line the highest level wins (博士 > 硕士 > 本科 > 专科),
        so "XX大学 硕士" is reported as 硕士 rather than 本科.
        """
        tiers = (
            ("博士", ("博士",)),
            ("硕士", ("硕士", "研究生")),
            ("本科", ("大学", "本科")),
            ("专科", ("大专", "专科", "学院")),
        )
        for line in self.fullText.split("\n"):
            for stage, keywords in tiers:
                if any(keyword in line for keyword in keywords):
                    return stage
        return ""

    # 12 Hometown (not implemented)
    def __search_Hometown (self):
        """Placeholder; always returns ""."""
        return ""

    # 22 Email
    def __search_Email (self):
        """Return the first token containing "@", or "".

        Lines carrying an "邮 箱" label are checked first, then the single
        words of the resume that contain both "@" and ".".
        """
        token = r"[a-zA-Z0-9_\-.@]+"
        for line in self.fullText.split("\n"):
            if re.search(r"邮[ ]+箱", line):
                for candidate in re.findall(token, line):
                    if "@" in candidate:
                        return candidate
        for word in self.fullWord:
            text = self.__word_text(word)
            if "@" in text and "." in text:
                for candidate in re.findall(token, text):
                    if "@" in candidate:
                        return candidate
        return ""

    # Skills
    def __search_Skill (self):
        """Return every line of the text that mentions a ``Skillset_List`` entry."""
        return [
            line for line in self.fullText.split("\n")
            if any(keyword in line for keyword in Skillset_List)
        ]

    # Entry point
    def search (self):
        """Run every field search and return the results as a dict.

        The path is split on ``/`` or ``\\``: the last part is the file name
        and the one before it the directory ("" when the path has no
        directory part). Each field is searched independently; if a search
        raises, the error is printed and the field keeps its default "".
        """
        sep_dir = re.split(r"/+|\\+", self.file_dir)
        directory = sep_dir[-2] if len(sep_dir) > 1 else ""
        file_name = sep_dir[-1]
        info = {"directory": directory, "file_name": file_name, "user_name": "", "email": "", "phone": "", "gender": "", "stage": "", "major": "", "age": "", "city": "", "skill": "", "jobs": "", "vendor": "", "condition": ""}

        finders = (
            ("user_name", "User_Name", self.__search_Name),
            ("email", "Email", self.__search_Email),
            ("phone", "Phone", self.__search_Phone),
            ("major", "Major", self.__search_Major),
            ("gender", "Gender", self.__search_Gender),
            ("stage", "Stage", self.__search_Stage),
            ("city", "City", self.__search_City),
            ("age", "Age", self.__search_Age),
            ("skill", "Skill", self.__search_Skill),
            ("jobs", "Jobs", self.__search_Jobs),
            ("vendor", "Vendor", self.__search_Vendor),
            ("condition", "Condition", self.__search_Cond),
        )
        for key, label, finder in finders:
            # Best effort: one failing field must not lose the others.
            try:
                info[key] = finder()
            except Exception as e:
                print(f"{label}: {e}")
        return info
################################################################################################
# 猎聘
# class Liepin (object):################################################################################################
# 智联
# class Zhilian (object):################################################################################################
# 前程无忧
# class Qiancheng (object):################################################################################################
# 51jobs
# class Jobs (object):################################################################################################
# 遍历并读取函数
class Reader (object):
    """List the resume files of one type inside a folder."""

    def __init__ (self, folder_Path):
        """Remember the folder to scan."""
        self.path = folder_Path

    def read (self, type):
        """Return the paths of the files in the folder whose name contains ``type``.

        ``type`` is a string marking the file type, e.g. ".pdf". Entries that
        are not regular files, and files whose name contains "$", are
        skipped. Every accepted file name is also appended to the
        module-level ``filename`` list.
        """
        ResumePath = []
        for file in os.listdir(self.path):
            # Build the path from the folder this reader was created with,
            # not from the global FolderPath.
            filepath = os.path.join(self.path, file)
            if not os.path.isfile(filepath):
                continue
            # Test the file name only, so that characters in the folder path
            # cannot make every file match or be rejected.
            if type in file and "$" not in file:
                ResumePath.append(filepath)
                filename.append(file)
        return ResumePath
################################################################################################
# 输出生成函数
class Generator (object):
    """Print one extracted resume record and serialise it to JSON."""

    def __init__ (self, sourceInfo):
        """Store the record (a dict as produced by ``Extractor.search``)."""
        self.info = sourceInfo

    def display (self, index = None):
        """Print the fields of the record to stdout.

        ``index`` is the candidate number shown in the banner. When omitted,
        the module-level ``counter`` is used if it exists, which is how the
        script in this file calls it.
        """
        result = self.info
        if index is None:
            index = globals().get("counter", "")
        print("################### Candidate ", index, " ###################")
        # Necessary info
        print("Name     : ", result["user_name"])
        print("Position :", result["jobs"])
        print("Major    : ", result["major"])
        print("Phone    : ", result["phone"])
        print("Gender   : ", result["gender"])
        print("Source   : ", result["file_name"])
        print("Vendor   : ", result["vendor"])
        print("Condition: ", result["condition"])
        # Optional info
        print("Email    : ", result["email"])
        print("City     : ", result["city"])
        print("Age      : ", result["age"])
        print("Stage    : ", result["stage"])
        # print("SkillSet : ", "\n".join(result["skill"]))
        print("\n\n\n")

    def generate_Json (self):
        """Return the record as pretty-printed JSON text.

        Keys are sorted, the indent is 4 and non-ASCII text is kept as-is.

        Raises:
            TypeError, ValueError: when the record cannot be serialised. The
            error is printed first and then re-raised, instead of failing
            later on an unassigned result variable.
        """
        try:
            return json.dumps(self.info, sort_keys = True, indent = 4, separators=(',',':'), ensure_ascii = False)
        except (TypeError, ValueError) as e:
            print(e)
            raise
################################################################################################
# Json形式下的简历信息发布至 dataverse (Power BI)
class dataverse_Publish (object):
    """Publish resume information held in a JSON file to a Dataverse."""

    def __init__ (self, sourceJson):
        """Remember the JSON file to publish."""
        self.source = sourceJson

    def process (self):
        """Create and publish a Dataverse collection from the JSON file.

        The file given to the constructor is used; "TestJson.json" is only
        the fallback when no file name was given. Returns the response of
        the final ``get_dataverse`` call.

        NOTE(review): the collection alias "Dataverse_Resumes" is assumed to
        match the alias defined inside the JSON file; confirm.
        """
        sourceFile = self.source or "TestJson.json"

        # Connect to the API.
        from pyDataverse.api import NativeApi
        api = NativeApi(BASE_URL, API_TOKEN)

        # Create the collection from the JSON description.
        from pyDataverse.models import Dataverse
        from pyDataverse.utils import read_file
        dv = Dataverse()
        dv.from_json(read_file(sourceFile))
        api.create_dataverse(":root", dv.json())
        api.publish_dataverse("Dataverse_Resumes")
        resp = api.get_dataverse("Dataverse_Resumes")
        return resp
################################################################################################
# 主函数
if __name__ == "__main__":
    # Step 1: list every resume file in the configured folder.
    PdfResumePath = Reader(folder_Path=FolderPath).read(".pdf")
    DocxResumePath = Reader(folder_Path=FolderPath).read(".docx")

    # Step 2: extract every PDF resume and write one JSON object per record.
    # `counter` stays a module-level name because Generator.display reads it.
    counter = 0
    Json_filename = "resume_Result.json"
    # Mode 'w' already empties the file; the context manager closes it even
    # when extraction or serialisation fails.
    with open(Json_filename, 'w', encoding='utf-8') as Json_file:
        for file in PdfResumePath:
            counter = counter + 1
            # if (counter < 6): continue
            info = Extractor(file_dir=file).search()
            # Show the record and append it to the JSON file.
            generator = Generator(sourceInfo=info)
            generator.display()
            Json_file.write(generator.generate_Json() + "\n")

    # Copy into the repository
    # Step 3: export to dataverse
    # dataverse_Publish(sourceJson = Json_filename).process()
    # https://orgd9c1d674.api.crm5.dynamics.com/api/data/v9.2
# https://org61624faf.api.crm5.dynamics.com/api/data/v9.2################################################################################################
# 函数 读取信息
# print (ResumePath[0])
# xingming_node = document_tree.getElementsByTagName("XingMing")[0]
# xingming = xingming_node.childNodes[0].data################################################################################################
# 函数 将一份简历信息写入 Excel 文件
# print (ResumePath)
# print (filename)# Test for pyDataversefrom pyDataverse.api    import NativeApi, DataAccessApi
from pyDataverse.models import Dataverse
from pyDataverse.models import Datafile
from pyDataverse.models import Dataset
from pyDataverse.utils  import read_file
from pyDataverse.api    import NativeApi# Input: the website of Dataverse and Token
BASE_URL = "https://org61624faf.api.crm5.dynamics.com/api/data/v9.2"
API_TOKEN = ""


class pyDataFunc (object):
    """Thin wrapper around the pyDataverse calls used by this test script."""

    def __init__ (self):
        """Take the base URL and token from the module constants."""
        self.baseURL = BASE_URL
        self.apiTOKEN = API_TOKEN
        # Created on first use and shared by every method below.
        self.api = None

    def _connect (self):
        """Return the NativeApi client, creating it on first use."""
        # Both settings must be strings.
        assert (isinstance(self.baseURL, str) and isinstance(self.apiTOKEN, str))
        if self.api is None:
            self.api = NativeApi(self.baseURL, self.apiTOKEN)
        return self.api

    def getAPI (self):
        """Connect, print the response of ``get_info_version()`` and return it."""
        response = self._connect().get_info_version()
        print (response)
        return response

    def setDataverse (self, filename):
        """Return a ``Dataverse`` object loaded from the JSON file ``filename``."""
        assert(isinstance(filename, str) and (".json" in filename))
        dv = Dataverse()
        dv.from_json(read_file(filename))
        return dv

    def setDataset (self, filename):
        """Return a ``Dataset`` object loaded from the JSON file ``filename``."""
        assert(isinstance(filename, str) and (".json" in filename))
        ds = Dataset()
        ds.from_json(read_file(filename))
        return ds

    def createDataverse (self, dv, verseLoc, verseName):
        """Create the collection ``dv`` under ``verseLoc`` and return the response."""
        assert(isinstance(verseName, str))
        response = self._connect().create_dataverse (verseLoc, dv.json())
        print(response)
        return response

    def createDataset (self, ds, setName):
        """Create the dataset ``ds`` in the collection ``setName`` and return the response."""
        assert(ds.validate_json() and isinstance(setName, str))
        response = self._connect().create_dataset(setName, ds.json())
        return response

    def getPID (self, resp):
        """Return ``data.persistentId`` from the JSON body of ``resp``."""
        ds_pid = resp.json()["data"]["persistentId"]
        return ds_pid

    def uploadDatafile (self, filename, pid):
        """Upload ``filename`` to the dataset ``pid``, print and return the response."""
        assert(isinstance(filename, str))
        df = Datafile()
        df.set({'pid': pid, 'filename': filename})
        response = self._connect().upload_datafile(pid, filename, df.json())
        print(response.json())
        return response


#############################################################################################
if __name__ == "__main__":
    # Set this to the dataset JSON file before running; the empty default
    # is rejected by setDataset.
    filename = ""
    client = pyDataFunc()
    ds  = client.setDataset (filename)
    rs1 = client.createDataset (ds, "A set name")
    pid = client.getPID (rs1)
    rs2 = client.uploadDatafile(filename, pid)

# Read results with .get()["<term>"]
# Check that a JSON file is valid with validate_json()
from selenium import webdriver
#######################################################################################
# Deploy Driver
class function (object):
    """Open a Chrome browser through selenium."""

    def __init__ (self, inputurl):
        """Store ``inputurl`` and start Chrome with the local chromedriver."""
        self.url = inputurl
        self.driver = webdriver.Chrome(r"C:/Alan .AIA/Python/Driver/chromedriver.exe")

    def getBing (self):
        """Navigate the browser to Bing."""
        # WebDriver needs a fully qualified URL; a bare host name is rejected
        # as an invalid argument.
        self.driver.get("https://www.bing.com")
#######################################################################################
# Main Function
if __name__ == "__main__":
    # Start the browser wrapper and open Bing.
    browser = function(inputurl = "www.bing.com")
    browser.getBing()
# -*- coding:utf-8 -*-
# Author: juzstu
# Time: 2019/8/22 0:31import pandas as pd
import numpy as np
import jieba as jb
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import re
import warnings
from tqdm import tqdm
from joblib import Parallel, delayed

warnings.filterwarnings('ignore')


def modified_jd_df(jd_path):
    """Read the tab-separated JD file at ``jd_path`` into a DataFrame.

    The first line supplies the column names. Line index 175425 contains
    stray tabs inside one field; they are removed before splitting.
    """
    rows = []
    # The context manager closes the file even if a line fails to parse.
    with open(jd_path, encoding='utf8') as jd_file:
        for i, line in enumerate(jd_file):
            if i == 175425:
                line = line.replace('销售\t|置业顾问\t|营销', '销售|置业顾问|营销')
            rows.append(line.split('\t'))
    return pd.DataFrame(rows[1:], columns=rows[0])


# Length of the salary code -> number of leading characters holding the minimum.
_SALARY_SPLIT = {12: 6, 11: 5, 10: 5, 9: 4}


def _salary_cut(x):
    """Return ``(text, cut)`` for a salary code, ``cut`` is None if unknown."""
    text = str(x)  # tolerate numeric / missing cells instead of failing on len()
    return text, _SALARY_SPLIT.get(len(text))


def get_min_salary(x):
    """Return the lower bound encoded in salary code ``x``, or -1 if unrecognised."""
    text, cut = _salary_cut(x)
    return -1 if cut is None else int(text[:cut])


def get_max_salary(x):
    """Return the upper bound encoded in salary code ``x``, or -1 if unrecognised."""
    text, cut = _salary_cut(x)
    return -1 if cut is None else int(text[cut:])


def is_same_user_city(df):
    """Tell whether the row's ``live_city_id`` appears in ``desire_jd_city_id``."""
    live_city_id = str(df['live_city_id'])
    desire_jd_city = df['desire_jd_city_id']
    return live_city_id in desire_jd_city


def jieba_cnt(df):
    """Count JD title / sub-type tokens that also occur in the user's experience.

    Returns 0 when any of the three text fields is not a string.
    """
    experience = df['experience']
    jd_title = df['jd_title']
    jd_sub_type = df['jd_sub_type']
    # jd_title is tokenised too, so it must be checked like the other fields.
    if not (isinstance(experience, str) and isinstance(jd_title, str)
            and isinstance(jd_sub_type, str)):
        return 0
    jd_tokens = set(jb.cut_for_search(jd_title)) | set(jb.cut_for_search(jd_sub_type))
    exp_tokens = set(jb.cut_for_search(experience))
    return sum(1 for t in jd_tokens if t in exp_tokens)


def cur_industry_in_desire(df):
    """Return whether ``cur_industry_id`` is a substring of ``desire_jd_industry_id``.

    Returns -1 when either value is not a string.
    """
    cur_industry_id = df['cur_industry_id']
    desire_jd_industry_id = df['desire_jd_industry_id']
    if isinstance(cur_industry_id, str) and isinstance(desire_jd_industry_id, str):
        return cur_industry_id in desire_jd_industry_id
    return -1


def desire_in_jd(df):
    """Return whether ``jd_sub_type`` is a substring of ``desire_jd_type_id``.

    Returns -1 when either value is not a string.
    """
    desire_jd_type_id = df['desire_jd_type_id']
    jd_sub_type = df['jd_sub_type']
    if isinstance(jd_sub_type, str) and isinstance(desire_jd_type_id, str):
        return jd_sub_type in desire_jd_type_id
    return -1


def get_tfidf(df, names, merge_id, n_components=10):
    """Return ``merge_id`` plus SVD-reduced TF-IDF features of column ``names``.

    The text column is vectorised with uni- and bi-grams and reduced to
    ``n_components`` dimensions named ``{names}_svd_{i}``. The result is
    concatenated by index, so ``df`` should carry a default integer index.
    """
    tfidf_enc_tmp = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vec_tmp = tfidf_enc_tmp.fit_transform(df[names])
    svd_tag_tmp = TruncatedSVD(n_components=n_components, n_iter=20, random_state=2019)
    tag_svd_tmp = pd.DataFrame(svd_tag_tmp.fit_transform(tfidf_vec_tmp))
    tag_svd_tmp.columns = [f'{names}_svd_{i}' for i in range(n_components)]
    return pd.concat([df[[merge_id]], tag_svd_tmp], axis=1)


def get_str(x):
    """Tokenise ``x`` with jieba and join the non-stop-word tokens with spaces.

    Reads the module-level ``stop_words`` collection, which must exist
    before this function is called.
    """
    return ' '.join([i for i in jb.cut(x) if i not in stop_words])


def offline_eval_map(train_df, label, pred_col):
    """Return the mean over users of the per-user average precision for ``label``.

    Rows are ranked per user by ``pred_col`` (descending); for every positive
    row the ratio "rank among positives / rank among all rows" is averaged.
    """
    tmp_train = train_df.copy()
    tmp_train['rank'] = tmp_train.groupby('user_id')[pred_col].rank(ascending=False, method='first')
    # Explicit copy: new columns are added to the positive subset below.
    tmp_x = tmp_train[tmp_train[label] == 1].copy()
    tmp_x[f'{label}_index'] = tmp_x.groupby('user_id')['rank'].rank(ascending=True, method='first')
    # Divide within the subset so the result never depends on index alignment.
    tmp_x['score'] = tmp_x[f'{label}_index'] / tmp_x['rank']
    return tmp_x.groupby('user_id')['score'].mean().mean()


def sub_on_line(train_, test_, pred, label, cate_cols, is_shuffle=True, use_cate=True):
    """Train 5-fold LightGBM models grouped by user and predict ``label``.

    Folds are built over unique ``user_id`` values so a user never appears in
    both the training and validation part of a fold. Feature importances are
    written to ``feat_imp_base.csv``.

    Returns a pair of frames: test predictions (``user_id``, ``jd_no``,
    ``label``) and out-of-fold training predictions (``user_id``, ``jd_no``,
    ``{label}_pred``, ``label``).
    """
    print(f'data shape:\ntrain--{train_.shape}\ntest--{test_.shape}')
    # Callers pass filtered slices; copy so the columns added below are not
    # written into a view of someone else's frame.
    train_ = train_.copy()
    test_ = test_.copy()
    n_splits = 5
    # scikit-learn rejects a random_state when shuffle is disabled.
    folds = KFold(n_splits=n_splits, shuffle=is_shuffle,
                  random_state=1024 if is_shuffle else None)
    sub_preds = np.zeros((test_.shape[0], folds.n_splits))
    # Float from the start: the column receives predicted probabilities.
    train_[f'{label}_pred'] = 0.0
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = pred
    print(f'Use{len(pred)}features ...')
    auc_scores = []
    params = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 63,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 1,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': -1,
        'verbose': -1,
    }
    train_user_id = train_['user_id'].unique()
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_user_id), start=1):
        print(f'the{n_fold}training start ...')
        train_mask = train_['user_id'].isin(train_user_id[train_idx])
        valid_mask = train_['user_id'].isin(train_user_id[valid_idx])
        train_x, train_y = train_.loc[train_mask, pred], train_.loc[train_mask, label]
        valid_x, valid_y = train_.loc[valid_mask, pred], train_.loc[valid_mask, label]
        print(f'for train user:{len(train_idx)}\nfor valid user:{len(valid_idx)}')
        if use_cate:
            dtrain = lgb.Dataset(train_x, label=train_y, categorical_feature=cate_cols)
            dvalid = lgb.Dataset(valid_x, label=valid_y, categorical_feature=cate_cols)
        else:
            dtrain = lgb.Dataset(train_x, label=train_y)
            dvalid = lgb.Dataset(valid_x, label=valid_y)
        # NOTE(review): early_stopping_rounds / verbose_eval are keyword
        # arguments of older LightGBM releases; newer ones expect callbacks.
        # Confirm against the installed version before upgrading.
        clf = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=10000,
            valid_sets=[dvalid],
            early_stopping_rounds=100,
            verbose_eval=100
        )
        sub_preds[:, n_fold - 1] = clf.predict(test_[pred], num_iteration=clf.best_iteration)
        auc_scores.append(clf.best_score['valid_0']['auc'])
        fold_importance_df[f'fold_{n_fold}_imp'] = clf.feature_importance()
        train_.loc[valid_mask, f'{label}_pred'] = \
            clf.predict(valid_x, num_iteration=clf.best_iteration)
    five_folds = [f'fold_{f}_imp' for f in range(1, n_splits + 1)]
    fold_importance_df['avg_imp'] = fold_importance_df[five_folds].mean(axis=1)
    fold_importance_df.sort_values(by='avg_imp', ascending=False, inplace=True)
    fold_importance_df[['Feature', 'avg_imp']].to_csv('feat_imp_base.csv', index=False, encoding='utf8')
    test_[label] = np.mean(sub_preds, axis=1)
    print('auc score', np.mean(auc_scores))
    return test_[['user_id', 'jd_no', label]], train_[['user_id', 'jd_no', f'{label}_pred', label]]


#############################################################################################################
# MAIN FUNCTION
def _load_user_table(path):
    """Read a tab-separated user table and derive the salary bound columns."""
    user = pd.read_csv(path, sep='\t')
    user['desire_jd_city_id'] = user['desire_jd_city_id'].apply(lambda x: re.findall(r'\d+', x))
    user['desire_jd_salary_id'] = user['desire_jd_salary_id'].astype(str)
    user['min_desire_salary'] = user['desire_jd_salary_id'].apply(get_min_salary)
    user['max_desire_salary'] = user['desire_jd_salary_id'].apply(get_max_salary)
    user['min_cur_salary'] = user['cur_salary_id'].apply(get_min_salary)
    user['max_cur_salary'] = user['cur_salary_id'].apply(get_max_salary)
    user.drop(['desire_jd_salary_id', 'cur_salary_id'], axis=1, inplace=True)
    return user


def _add_exposure_counts(df):
    """Add per-(user, jd) and per-user exposure counts to ``df`` in place."""
    df['user_jd_cnt'] = df.groupby(['user_id', 'jd_no'])['jd_no'].transform('count').values
    df['jd_cnt'] = df.groupby(['user_id'])['jd_no'].transform('count').values
    df['jd_nunique'] = df.groupby(['user_id'])['jd_no'].transform('nunique').values


if __name__ == "__main__":
    min_work_year = {103: 1, 305: 3, 510: 5, 1099: 10}
    max_work_year = {103: 3, 305: 5, 510: 10}
    degree_map = {'其他': 0, '初中': 1, '中技': 2, '中专': 2, '高中': 2, '大专': 3, '本科': 4,
                  '硕士': 5, 'MBA': 5, 'EMBA': 5, '博士': 6}
    sub_path = './submit/'
    train_data_path = './zhaopin_round1_train_20190716/'
    test_data_path = './zhaopin_round1_test_20190716/'

    # ---- user table (train) ----
    train_user = _load_user_table(train_data_path + 'table1_user')

    # ---- job description table ----
    train_jd = pd.read_csv(train_data_path + 'table2_jd.csv', sep='\t')
    train_jd.drop(['company_name', 'max_edu_level', 'is_mangerial', 'resume_language_required'],
                  axis=1, inplace=True)
    train_jd['min_work_year'] = train_jd['min_years'].map(min_work_year)
    train_jd['max_work_year'] = train_jd['min_years'].map(max_work_year)
    # Missing dates are stored as the literal text \N; map them to a sentinel
    # date first and blank them out after parsing. Assign the result back
    # instead of relying on an in-place update of a column selection.
    for date_col in ['start_date', 'end_date']:
        train_jd[date_col] = train_jd[date_col].replace(r'\N', '22000101')
    for date_col in ['start_date', 'end_date']:
        train_jd[date_col] = pd.to_datetime(
            train_jd[date_col].astype(str).apply(lambda x: f'{x[:4]}-{x[4:6]}-{x[6:]}'))
    train_jd.loc[train_jd['end_date'] == '2200-01-01', ['start_date', 'end_date']] = np.nan

    # get_str() reads this module-level list.
    with open('中文停用词表.txt', 'r', encoding='utf8') as stop_file:
        stop_words = [line.strip() for line in stop_file]
    stop_words.extend(['\n', '\xa0', '\u3000', '\u2002'])

    tmp_cut = Parallel(n_jobs=-1)(
        delayed(get_str)(train_jd.loc[ind]['job_description\n']) for ind in tqdm(train_jd.index))
    tfidf_enc = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vec = tfidf_enc.fit_transform(tmp_cut)
    svd_tag = TruncatedSVD(n_components=10, n_iter=20, random_state=2019)
    tag_svd = pd.DataFrame(svd_tag.fit_transform(tfidf_vec))
    tag_svd.columns = [f'desc_svd_{i}' for i in range(10)]
    train_jd = pd.concat([train_jd, tag_svd], axis=1)

    # ---- action table (train) ----
    train_action = pd.read_csv(train_data_path + 'table3_action', sep='\t')
    _add_exposure_counts(train_action)
    train_action = train_action.drop_duplicates()
    # After sorting, keep='last' retains the row with the highest
    # delivered / satisfied values for each (user, jd) pair.
    train_action.sort_values(['user_id', 'jd_no', 'delivered', 'satisfied'], inplace=True)
    train_action = train_action.drop_duplicates(subset=['user_id', 'jd_no'], keep='last')
    train_action = train_action[train_action['jd_no'].isin(train_jd['jd_no'].unique())]
    train = train_action.merge(train_user, on='user_id', how='left')
    train = train.merge(train_jd, on='jd_no', how='left')
    del train['browsed']
    print('train data base feats already generated ...')

    # ---- test tables ----
    test_user = _load_user_table(test_data_path + 'user_ToBePredicted')
    test = pd.read_csv(test_data_path + 'zhaopin_round1_user_exposure_B_20190819', sep=' ')
    _add_exposure_counts(test)
    test = test.drop_duplicates()
    # -1 marks rows whose labels are to be predicted.
    test['delivered'] = -1
    test['satisfied'] = -1
    test = test.merge(test_user, on='user_id', how='left')
    test = test.merge(train_jd, on='jd_no', how='left')
    print('test data base feats already generated ...')

    # DataFrame.append no longer exists in current pandas; concat is the
    # equivalent call and keeps the original row labels as append did.
    all_data = pd.concat([train, test], sort=False)
    all_data['jd_user_cnt'] = all_data.groupby(['jd_no'])['user_id'].transform('count').values
    all_data['same_user_city'] = all_data.apply(is_same_user_city, axis=1).astype(int)
    all_data['city'] = all_data['city'].fillna(-1).astype(int)
    all_data['same_com_live'] = (all_data['city'] == all_data['live_city_id']).astype(int)
    for edu_col in ['min_edu_level', 'cur_degree_id']:
        all_data[edu_col] = all_data[edu_col].apply(lambda x: x.strip() if isinstance(x, str) else x)
    all_data['min_edu_level_num'] = all_data['min_edu_level'].map(degree_map)
    all_data['cur_degree_id_num'] = all_data['cur_degree_id'].map(degree_map)
    all_data['same_edu'] = (all_data['min_edu_level'] == all_data['cur_degree_id']).astype(int)
    all_data['gt_edu'] = (all_data['cur_degree_id_num'] >= all_data['min_edu_level_num']).astype(int)
    all_data['min_desire_salary_num'] = (all_data['min_desire_salary'] <= all_data['min_salary']).astype(int)
    all_data['min_cur_salary_num'] = (all_data['min_cur_salary'] <= all_data['min_salary']).astype(int)
    all_data['max_desire_salary_num'] = (all_data['max_desire_salary'] <= all_data['max_salary']).astype(int)
    all_data['max_cur_salary_num'] = (all_data['max_cur_salary'] <= all_data['max_salary']).astype(int)
    all_data['same_desire_industry'] = all_data.apply(cur_industry_in_desire, axis=1).astype(int)
    all_data['same_jd_sub'] = all_data.apply(desire_in_jd, axis=1).astype(int)
    all_data['start_month'] = all_data['start_date'].dt.month
    all_data['start_day'] = all_data['start_date'].dt.day
    # The end_* features are taken from end_date (they previously duplicated
    # the start_date values).
    all_data['end_month'] = all_data['end_date'].dt.month
    all_data['end_day'] = all_data['end_date'].dt.day
    all_data['jd_days'] = (all_data['end_date'] - all_data['start_date']).dt.days
    all_data['user_work_year'] = 2019 - all_data['start_work_date'].replace('-', np.nan).astype(float)
    all_data['gt_min_year'] = (all_data['user_work_year'] > all_data['min_work_year']).astype(int)
    all_data['gt_max_year'] = (all_data['user_work_year'] > all_data['max_work_year']).astype(int)
    all_data['len_experience'] = all_data['experience'].apply(
        lambda x: len(x.split('|')) if isinstance(x, str) else np.nan)
    all_data['desire_jd_industry_id_len'] = all_data['desire_jd_industry_id'].apply(
        lambda x: len(x.split(',')) if isinstance(x, str) else np.nan)
    all_data['desire_jd_type_id_len'] = all_data['desire_jd_type_id'].apply(
        lambda x: len(x.split(',')) if isinstance(x, str) else np.nan)
    all_data['eff_exp_cnt'] = all_data.apply(jieba_cnt, axis=1)
    all_data['eff_exp_ratio'] = all_data['eff_exp_cnt'] / all_data['len_experience']
    all_data.drop(['cur_degree_id_num', 'cur_degree_id', 'desire_jd_city_id', 'min_years',
                   'start_work_date', 'start_date', 'end_date', 'key', 'min_edu_level'],
                  axis=1, inplace=True)

    # City / category cardinality statistics: (new column, group key, source column)
    nunique_feats = [
        ('user_jd_city_nunique', 'user_id', 'city'),
        ('jd_user_city_nunique', 'jd_no', 'live_city_id'),
        ('jd_title_nunique', 'user_id', 'jd_title'),
        ('jd_sub_type_nunique', 'user_id', 'jd_sub_type'),
        ('user_desire_jd_industry_id_nunique', 'jd_no', 'desire_jd_industry_id'),
        ('user_desire_jd_type_id_nunique', 'jd_no', 'desire_jd_type_id'),
    ]
    for new_col, key, src in nunique_feats:
        all_data[new_col] = all_data.groupby(key)[src].transform('nunique').values

    # Salary, posting duration and age statistics:
    # (column prefix, group key, source column, statistics).
    # The order is kept as-is because it determines the feature order.
    basic_stats = ['min', 'max', 'mean', 'std']
    group_stats = [
        ('user_jd_min_salary', 'user_id', 'min_salary', basic_stats),
        ('user_jd_max_salary', 'user_id', 'max_salary', basic_stats),
        ('jd_user_desire_min_salary', 'jd_no', 'min_desire_salary', basic_stats),
        ('jd_user_desire_max_salary', 'jd_no', 'max_desire_salary', basic_stats),
        ('jd_days', 'user_id', 'jd_days', basic_stats + ['skew']),
        ('age', 'jd_no', 'birthday', basic_stats + ['skew']),
    ]
    for prefix, key, src, stats in group_stats:
        for stat in stats:
            all_data[f'{prefix}_{stat}'] = all_data.groupby(key)[src].transform(stat).values

    for j in ['jd_title', 'jd_sub_type']:
        le = LabelEncoder()
        all_data[j] = all_data[j].fillna('nan')
        all_data[f'{j}_map_num'] = le.fit_transform(all_data[j])

    # For non-string values the join runs over the characters of 'nan',
    # producing 'n a n'; kept unchanged so the text features stay the same.
    all_data['experience'] = all_data['experience'].apply(
        lambda x: ' '.join(x.split('|') if isinstance(x, str) else 'nan'))
    exp_gp = all_data.groupby('jd_no')['experience'].agg(lambda x: ' '.join(x.to_list())).reset_index()
    exp_gp = get_tfidf(exp_gp, 'experience', 'jd_no')
    all_data = all_data.merge(exp_gp, on='jd_no', how='left')

    excluded = ['user_id', 'jd_no', 'delivered', 'satisfied'] + \
               ['desire_jd_industry_id', 'desire_jd_type_id', 'cur_industry_id', 'cur_jd_type', 'experience',
                'jd_title', 'jd_sub_type', 'job_description\n']
    use_feats = [c for c in all_data.columns if c not in excluded]

    sub_sat, train_pred_sat = sub_on_line(
        all_data[all_data['satisfied'] != -1], all_data[all_data['satisfied'] == -1],
        use_feats, 'satisfied', ['live_city_id', 'city'], use_cate=True)
    sub_dev, train_pred_dev = sub_on_line(
        all_data[all_data['delivered'] != -1], all_data[all_data['delivered'] == -1],
        use_feats, 'delivered', ['live_city_id', 'city'], use_cate=True)

    # The returned frames are column selections; copy before adding columns.
    train_pred_sat = train_pred_sat.copy()
    sub_sat = sub_sat.copy()
    train_pred_sat['merge_pred'] = train_pred_sat['satisfied_pred'] * 0.8 + train_pred_dev['delivered_pred'] * 0.2
    sub_sat['merge_prob'] = sub_sat['satisfied'] * 0.8 + sub_dev['delivered'] * 0.2
    train_pred_sat = train_pred_sat.merge(
        all_data[all_data['delivered'] != -1][['user_id', 'jd_no', 'delivered']],
        on=['user_id', 'jd_no'], how='left')
    dev_map = offline_eval_map(train_pred_sat, 'delivered', 'merge_pred')
    sat_map = offline_eval_map(train_pred_sat, 'satisfied', 'merge_pred')
    print('dev map:', round(dev_map, 4), 'sat map:', round(sat_map, 4), 'final score:',
          round(0.7 * sat_map + 0.3 * dev_map, 4))

    # Submission order per user: JDs known from the training JD table sorted
    # by blended probability, followed by the JDs that are not in that table.
    sub_cols = ['user_id', 'jd_no', 'merge_prob']
    known_jd = sub_sat['jd_no'].isin(train_jd['jd_no'])
    pieces = []
    for uid in sub_sat['user_id'].unique():
        is_user = sub_sat['user_id'] == uid
        pieces.append(sub_sat[is_user & known_jd].sort_values('merge_prob', ascending=False)[sub_cols])
        # 'merge_prob' is the only score column on sub_sat ('merge_rank' never existed).
        pieces.append(sub_sat[is_user & ~known_jd][sub_cols])
    sub_df = pd.concat(pieces) if pieces else pd.DataFrame(columns=sub_cols)
    sub_df[['user_id', 'jd_no']].to_csv('sub_base.csv', index=False)

import pandas as pd
import numpy  as np
import jieba
import json

# This file expects JSON input of the form [{}, {}, ..., {}]

# Set up paths
FILEPATH = "C:\\Alan .AIA\\Python\\CV_Automation\\resume_Result.json"
CSV_PATH = "C:\\Alan .AIA\\Python\\CV_Automation\\"


# Sub Functions
def inputSource (sourcePath):
    """Load a resume data set from a JSON or CSV file into a DataFrame.

    The file name is the part of ``sourcePath`` after the last backslash.
    A JSON source is printed and also saved as a CSV file under ``CSV_PATH``
    (encoded as ``utf_8_sig``); a CSV source is only read.

    Raises:
        ValueError: if the file name contains neither ".json" nor ".csv".
    """
    filename = sourcePath.split("\\")[-1]
    if   (".json" in filename):
        targetDS  = pd.read_json(sourcePath, encoding = "utf-8")
        print(targetDS)
        targetDS.to_csv(CSV_PATH + filename.split(".")[0] + ".csv", encoding = "utf_8_sig")
    elif (".csv" in filename):
        targetDS = pd.read_csv(sourcePath, encoding = "utf-8")
    else:
        # Fail with a clear message rather than an unbound local at the return.
        raise ValueError("Unsupported source file (expected .json or .csv): " + filename)
    return targetDS


# Main Function
if __name__ == "__main__":
    target = inputSource(sourcePath = FILEPATH)
// C#/MSAL
using Microsoft.Identity.Client;
using Newtonsoft.Json.Linq;
using System;
using System.Net.Http;
using System.Net.Http.Headers;

namespace PowerApps.Samples
{
    /// <summary>
    /// Console sample: signs in interactively with MSAL, calls the Dataverse
    /// Web API 'WhoAmI' function and prints the returned user ID.
    /// </summary>
    class Program
    {
        static void Main()
        {
            // TODO Specify the Dataverse environment name to connect with.
            string resource = "https://<env-name>.api.<region>.dynamics.com";

            // Azure Active Directory app registration shared by all Power App samples.
            // For your custom apps, you will need to register them with Azure AD yourself.
            // See https://docs.microsoft.com/powerapps/developer/data-platform/walkthrough-register-app-azure-active-directory
            var clientId = "51f81489-12ee-4a9e-aaae-a2591f45987d";
            var redirectUri = "app://58145B91-0C36-4500-8554-080854F2AC97";

            #region Authentication
            var authBuilder = PublicClientApplicationBuilder.Create(clientId)
                .WithAuthority(AadAuthorityAudience.AzureAdMultipleOrgs)
                .WithRedirectUri(redirectUri)
                .Build();
            var scope = resource + "/.default";
            string[] scopes = { scope };

            // Blocks until the interactive sign-in has completed.
            AuthenticationResult token =
                authBuilder.AcquireTokenInteractive(scopes).ExecuteAsync().Result;
            #endregion Authentication

            // The client and the response are disposed when their blocks end.
            using (var client = new HttpClient
            {
                // See https://docs.microsoft.com/powerapps/developer/data-platform/webapi/compose-http-requests-handle-errors#web-api-url-and-versions
                BaseAddress = new Uri(resource + "/api/data/v9.2/"),
                Timeout = new TimeSpan(0, 2, 0)    // Standard two minute timeout on web service calls.
            })
            {
                #region Client configuration
                // Default headers for each Web API call.
                // See https://docs.microsoft.com/powerapps/developer/data-platform/webapi/compose-http-requests-handle-errors#http-headers
                HttpRequestHeaders headers = client.DefaultRequestHeaders;
                headers.Authorization = new AuthenticationHeaderValue("Bearer", token.AccessToken);
                headers.Add("OData-MaxVersion", "4.0");
                headers.Add("OData-Version", "4.0");
                headers.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json"));
                #endregion Client configuration

                #region Web API call
                // Invoke the Web API 'WhoAmI' unbound function.
                // See https://docs.microsoft.com/powerapps/developer/data-platform/webapi/compose-http-requests-handle-errors
                // See https://docs.microsoft.com/powerapps/developer/data-platform/webapi/use-web-api-functions#unbound-functions
                using (HttpResponseMessage response = client.GetAsync("WhoAmI").Result)
                {
                    if (response.IsSuccessStatusCode)
                    {
                        // Parse the JSON formatted service response to obtain the user ID.
                        JObject body = JObject.Parse(response.Content.ReadAsStringAsync().Result);
                        Guid userId = (Guid)body["UserId"];
                        Console.WriteLine("Your user ID is {0}", userId);
                    }
                    else
                    {
                        Console.WriteLine("Web API call failed");
                        Console.WriteLine("Reason: " + response.ReasonPhrase);
                    }
                }
                #endregion Web API call
            }

            // Pause program execution by waiting for a key press.
            Console.ReadKey();
        }
    }
}

NLP Python相关推荐

  1. python kmean 多维_绘制多维Kmeans集群NLP python

    我有一个为NLP分类器设计的多维向量.在 以下是数据帧(文本框): 我使用TfidVectorizer创建向量:from sklearn.feature_extraction.text import ...

  2. 关于深度学习、NLP和计算机视觉的30个顶级Python库

    双语原文链接:Top Python Libraries for Deep Learning, Natural Language Processing & Computer Vision 请注意 ...

  3. ​关于深度学习、NLP和计算机视觉的30个顶级Python库

    正文字数:2214  阅读时长:3分钟 再次感谢艾哈迈德·阿尼斯(Ahmed Anis)为收集这些数据做出的贡献,并感谢KDnuggets的其他工作人员的意见,见解和建议. 作者 / Matthew ...

  4. python之torchlight使用_关于深度学习、NLP和计算机视觉的30个顶级Python库

    再次感谢艾哈迈德·阿尼斯(Ahmed Anis)为收集这些数据做出的贡献,并感谢KDnuggets的其他工作人员的意见,见解和建议. 作者 / Matthew Mayo 请注意,下面是由Gregory ...

  5. 独家 | 关于NLP和机器学习之文本处理的你需要知道的一切(附学习资源)

    作者:Kavita Ganesan 翻译:陈雨琳 校对:丁楠雅 本文4800字,建议阅读20分钟. 本文将介绍自然语言处理和机器学习中常见的文本预处理方法. 标签: 数据处理 https://www. ...

  6. [Python人工智能] 一.TensorFlow环境搭建及神经网络入门

    从本篇文章开始,作者正式开始研究Python深度学习.神经网络及人工智能相关知识.第一篇文章主要讲解神经网络基础概念,同时讲解TensorFlow2.0的安装过程及基础用法,主要结合作者之前的博客和& ...

  7. nlp自然语言处理_自然语言处理(NLP):不要重新发明轮子

    nlp自然语言处理 介绍 (Introduction) Natural language processing (NLP) is an intimidating name for an intimid ...

  8. 创建python的虚拟环境(图文教程),并使用。

    创建python的虚拟环境 第一步:打开anaconda命令行,即打开anaconda prompt. base的主环境切换到虚拟环境nlp (base) C:\Users\user>conda ...

  9. python和nltk自然语言处理书评_python自然语言处理_自然语言处理入门

    说明:本文是<Python数据分析与数据化运营>中的"3.12.4 自然语言文本预处理".下面是正文内容-与数据库 本文从概念和实际操作量方面,从零开始,介绍在Pyth ...

最新文章

  1. 新盒模型移动端的排版
  2. 源码 移植_FreeModbus移植总结
  3. 不安装cudnn可不可以_Linux非root用户如何优雅的安装cuda和cudnn
  4. bzoj2375 疯狂的涂色
  5. python读取多行json_如何在Python中读取包含多个JSON对象的JSON文件?
  6. c语言银行系统个人心得,c,,银行管理实验报告心得体会.docx
  7. 微信小程序 data中数据值的更改与储存
  8. elementui中下拉菜单需要传入多个参数的处理
  9. ASP.NET 订餐系统-程序+配置文档
  10. Win10卸载微软sql服务器,卸载 SQL Server Management Studio
  11. 关于安装Turn服务器:coturn,TurnServer,Restund,reTurnServer
  12. Navicat Premium15安装与激活(完整激活版)
  13. linux开发板推荐
  14. 投票和排名系统C语言顺序表,电视大赛观众投票及排名系统C语言设计.docx
  15. word排版案例报告_看完这4个文章排版要点,你就会排版啦!
  16. 职高计算机专业毕业200字自我鉴定,职高计算机专业的自我鉴定
  17. 高斯日记python实现
  18. 小白起步-大数据环境搭建-Java环境配置
  19. 高数中dy和Δy有什么区别
  20. Google浏览器在新标签页打开网址设置

热门文章

  1. 四川计算机专业的二本大学排名及分数线,2019-2020四川二本大学排名及分数线(理科+文科)...
  2. qbo 点云数据(距离)的获取
  3. 你想要拥有自己的搜索引擎吗?
  4. python爬虫: 爬取拉勾网职位并分析
  5. 如何成为一个牛逼的程序猿
  6. 如何去掉a标签的下划线 各种样式详解
  7. python3 中解决\u8bf7\u6c42\u6210\u529f“格式编码问题
  8. c语言非法标识符是什么意思,123H为什么是非法标识符
  9. 【python】简单使用selenium编写无界面谷歌浏览器的网页登录和签到功能
  10. java程序打包一体化:代码-jar-exe-安装包(图文详解、资源提供)