progName = "GedExplore_v05.py"
# by David@ColeCanada.com
# on 2023ISep19
#
# Purpose: to explore a GEDCOM by
#   1 loading the keys into a Python dictionary
#   2 creating a ged family database
#       putting all info into a Python Dictionary
#       using unique keys
#   3 display the metrics
# v03 will use "try" when reading the ged file
#
# TODO
#   0. validate FileName
#       ignore records preceding "1 SCHEMA"
#   1. get key between @ and @
#   2. parse the NAME correctly, allowing for mistakes
#       given/surname/title
#       both given & surname must not be null
#   3. find refs to missing INDI
#   4. find FAMs with no parent(s)
#   5. during second pass
#       ensure FAMS match and FAMC match
#       count INDIs and FAMs
#       gather BIRT, DEAT, MARR, DATE, PLAC, SEX,
#       ensure that each record has been used
#
# Goal: see GEDExplore_plan.txt
#
# Issue 1
#   Encountered utf-8 error in "ColesOfDevonDeLeg15L10.ged"
#   after record 674
#   So I used "lastGood=674"
#   when lastGood==10 it will list all recN
#   but when I copied bad file to "ColesOfDevo.ged", it worked fine
#   so this must be a data record in a file stored on microsd. Doh!
#   I believe that these 2 files "should be" identical. Doh!
print(progName)

#### Control Parameters
displayOnly=10 #normally 10; at most this many keys per prefix are kept for the final dump
FILEname="not stated in header"
prefixList = []

################################################################
# Input file selection: exactly one gedFile assignment is active.
# The commented-out alternatives record, per file, whether a utf-8 read
# error was seen and the lastGood record number that went with it.
#gedFile=input("gedFile name")
#gedFile="COLEDES4.GED" #no utf-8 error OLDEST #lastGood=10
#gedFile="SmithEg.ged" #no utf-8 error #lastGood=10
#gedFile="davidcole3.ged" #utf-8 error after nRec 10924 OLDEST & BIGGEST #lastGood=10925
#gedFile="ColesOfDevon.ged" #no utf-8 error #lastGood=10
#gedFile="ColesOfDevonDeLeg15L10.ged" #error after nRec 674 #lastGood=674
#gedFile="JohnsonSampleTree_asof_2022BFeb08.ged" #no utf-8 error #lastGood=10
#gedFile="Cole21CMar22_asof_22DApr06.ged" #error after 15983 #lastGood=15983
#gedFile="Fam1.GED" #no utf-8 error #lastGood=10
#gedFile="Cole21CMar22_asof_22DApr06_X15984.ged" #error after 15983 #lastGood=15983
#gedFile="Fam1.GED"
#gedFile="ColeDavid2021LDec14_deFS.ged"
# ColeDavid2021LDec14_deFS.ged
gedFile="ColeDavid2021LDec14_deFS_viaGeany.ged"
#gedFile="ColeDavid2021LDec14_deFS_202viaGeany.ged"
#gedFile="ColeDavid2021LDec14_deFS_998viaGeany.ged"
#gedFile="SmithEg.ged"
print("gedFile:",gedFile)

# NOTE(review): no encoding is passed, so the platform default decides
# whether undecodable bytes raise at all -- confirm utf-8 is intended.
g=open(gedFile)

#### Working state
nRec=0                  # 1-based number of the record being read
gedDict={}              # key taken from a "0 @...@" record -> text of a following line
gedHeadDict={}
errList=[]              # one message per record that could not be decoded
isHead=False
isPrevKey=False         # True while lines following a "0 @" record are still wanted
famList={"1 HUSB","1 WIFE","1 CHIL"}    # the FAM sub-records that get stored

#### Counters and header fields shown in the final summary
errCnt=0
FGraveCnt=0
FSidCnt=0
SiteCnt=0
DC0cnt=0
YLCcnt=0
firstUTF_recN=0         # record number of the first decode error (0 = none)
prevRec=""
EMAILstr=""
SOURstr=""
DESTstr=""
DATEstr=""
# isHdrProcessed is True only after the header has been fully processed
isHdrProcessed=False

# Substrings that mark a FamilySearch id line.
fsIdMarkers=("_FSFTID","TYPE FamilySearch ID")
# Specific ids counted in YLCcnt (labels as in the original trailing comments).
ylcIds=(
    "L69G-PMF", #Victor "Vic" Charles/Cole/
    "L2VM-NSV", #Jessie /Cole/
    "L2VM-JL1", #Margaret (Madge) Cole
    "LF5L-7Y1", #YLC
)

while True :
    nRec=nRec+1
    # progress marker every 10,000 records
    if nRec%10000==0 :
        print(nRec)
    try:
        line = g.readline()
    except UnicodeDecodeError:
        # Only decode failures are counted; any other exception now
        # propagates instead of being logged as a utf-8 error.
        errCnt+=1
        if errCnt==1 :
            #remember the # of the first error record
            firstUTF_recN=nRec
        #if end
        # substitute a placeholder so the rest of the loop still sees a line
        line = "bad+utf-8 error" +"\n"
        utf8Line="utf-8 err after:"+"nRec:"+str((nRec-1))+",prevRec:"+prevRec.strip()
        errList.append(utf8Line)
    # The rest of the body used to sit in a "finally:" clause; a break there
    # silently discards any exception in flight, so it now follows the try.
    prevRec=line
    # at eof, break out of while loop
    if line == '' :
        break
    #if lastGood==10 : print(nRec)

    # NOTE(review): source text is MISSING at this point in this copy of
    # the file. The two surviving fragments on either side of the gap are:
    #     #if nRec>=(lastGood-5) and nRec
    #     0 :isFound=True
    # The end markers "#if not isHead end" and "#if nRec<9555000 end"
    # survive further down, so the code below was presumably nested under
    # those tests. The lost span presumably also held the header parsing
    # (SOURstr, DESTstr, DATEstr, FILEname, EMAILstr), the SiteCnt / DC0cnt
    # tests and the find() test(s) feeding FGraveCnt.
    # Restore it from the author's copy.
    isFound=False   # placeholder: the real test(s) for FGraveCnt are in the lost span
    if isFound:
        print("at 188 ", nRec,":",line.strip())
        FGraveCnt+=1
    #if end

    # FamilySearch ids ("in" also matches at column 0, which find()>0 missed)
    #if line.find("FAMC")>0 :isFound=True #@S2@
    if any(marker in line for marker in fsIdMarkers):
        FSidCnt+=1
    #if end

    # ids of specific people
    #if line.find("Jessie /COLE/")>0 :isFound=True
    if any(personId in line for personId in ylcIds):
        print("at 208 ", nRec,":",line.strip())
        YLCcnt+=1
    #if end
    isFound=False

    if line[0:3]=="0 @":
        # start of an INDI / FAM / SOUR record: remember it as the pending key
        isPrevKey=True
        prevLine=line
        isHdrProcessed=True
    elif isPrevKey:
        recType=prevLine[3:4]
        if recType=="I":
            # store the text after the first 7 characters of the next line
            # NOTE(review): prevLine[2:8] assumes a fixed-width id (see TODO 1)
            gedDict[prevLine[2:8]]=line[7:-1]+"\n"
            isPrevKey=False
        elif recType=="F":
            # keep watching every line of the family (isPrevKey stays True);
            # a repeated tag such as "1 CHIL" overwrites the previous entry
            famEntity=line[0:6]
            if famEntity in famList:
                gedDict[prevLine[2:8]+famEntity]=line
            #if end
        elif recType=="S":
            # only put Source definitions in the dictionary
            gedDict[prevLine[2:-1]]=line[7:-1]+"\n"
            isPrevKey=False
        #if end
    #if line[0:3]=="0 @" end
#while True end

# nRec was incremented before the read that hit end-of-file; step back so
# it is the number of records actually read.
nRec-=1
print("end of GedExplore Analysis")
print()

# Count the gedDict keys that start with each record prefix and keep the
# first displayOnly of each, in dictionary order, for the optional dump.
prefixIcnt=0
prefixFcnt=0
prefixScnt=0
for wantedPrefix in ("@I","@F","@S"):
    matchingKeys=[key for key in gedDict if key[0:2]==wantedPrefix]
    for position,key in enumerate(matchingKeys,1):
        if position>displayOnly :
            break
        prefixList.append(key+","+gedDict[key].strip())
    #for end
    if wantedPrefix=="@I" :
        prefixIcnt+=len(matchingKeys)
    elif wantedPrefix=="@F" :
        prefixFcnt+=len(matchingKeys)
    else:
        prefixScnt+=len(matchingKeys)
    #if end
#for end
len_gedDict=len(gedDict)

# One "label value" line per metric, in the fixed report order.
summaryRows=(
    ("gedFile:",gedFile),
    ("SOURstr:",SOURstr),
    ("DESTstr:",DESTstr),
    ("DATEstr:",DATEstr),
    ("FILEname:",FILEname),
    ("EMAILstr:",EMAILstr),
    ("record Count:",nRec),
    ("errCnt:",errCnt),
    ("len_gedDict:",len_gedDict),
    ("Icnt:",prefixIcnt),
    ("Fcnt:",prefixFcnt),
    ("Scnt:",prefixScnt),
    ("FGraveCnt:",FGraveCnt),
    ("FSidCnt:",FSidCnt),
    ("SiteCnt:",SiteCnt),
    ("DC0cnt:",DC0cnt),
    ("YLCcnt:",YLCcnt),
    ("first UTF-8 err_recN:",firstUTF_recN),
)
print()
print("GedExplore Summary:")
print("-------------------")
for label,value in summaryRows:
    print(label,value)
#for end

# Any non-empty reply asks for the dump; plain Enter skips it.
strIn=input("Hit Enter to suppress dump of Lists:")
if strIn!="":
    print()
    print("Lists: headers, indexes & errors:")
    print("---------------------------------")
    #dump out the lists
    print(*prefixList,sep="\n",end="\n" if prefixList else "")
    if errList:
        print("errList:")
    #if end
    # only the first 9 error messages are shown
    for strL in errList[:9]:
        print("strL:",strL)
    #for end
#if end

g.close()
print()
print("end of: "+progName)