本文整理汇总了C++中PBowDocBs类的典型用法代码示例。如果您正苦于以下问题：C++ PBowDocBs类的具体用法？C++ PBowDocBs怎么用？C++ PBowDocBs使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了PBowDocBs类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: SaveLnDocTxt
void TBowFl::SaveLnDocTxt(const PBowDocBs& BowDocBs, const TStr& FNm, const bool& UseDocStrP){
TFOut SOut(FNm);
int Docs=BowDocBs->GetDocs();
for (int DId=0; DId<Docs; DId++){
printf("%d/%d\r", DId+1, Docs);
// output document-name
TStr DocNm=TStr::GetFNmStr(BowDocBs->GetDocNm(DId));
SOut.PutStr(DocNm);
// output categories
for (int CIdN=0; CIdN<BowDocBs->GetDocCIds(DId); CIdN++){
int CId=BowDocBs->GetDocCId(DId, CIdN);
TStr CatNm=TStr::GetFNmStr(BowDocBs->GetCatNm(CId));
SOut.PutCh(' '); SOut.PutCh('!'); SOut.PutStr(CatNm);
}
// output words
if (UseDocStrP){
TStr DocStr=BowDocBs->GetDocStr(DId);
// DocStr.DelChAll('\n'); DocStr.DelChAll('\r');
SOut.PutCh(' '); SOut.PutStr(DocStr);
} else {
int DocWIds=BowDocBs->GetDocWIds(DId);
int WId; double WordFq;
for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){
BowDocBs->GetDocWIdFq(DId, DocWIdN, WId, WordFq);
TStr WordStr=BowDocBs->GetWordStr(WId);
for (int WordFqN=0; WordFqN<WordFq; WordFqN++){
SOut.PutCh(' '); SOut.PutStr(WordStr);
}
}
}
SOut.PutLn();
}
printf("\n");
}
示例2: SaveSparseMatlabTxt
void TBowFl::SaveSparseMatlabTxt(const PBowDocBs& BowDocBs,
const PBowDocWgtBs& BowDocWgtBs, const TStr& FNm,
const TStr& CatFNm, const TIntV& _DIdV) {
TIntV DIdV;
if (_DIdV.Empty()) {
BowDocBs->GetAllDIdV(DIdV);
} else {
DIdV = _DIdV;
}
// generate map of row-ids to words
TFOut WdMapSOut(TStr::PutFExt(FNm, ".row-to-word-map.dat"));
for (int WId = 0; WId < BowDocWgtBs->GetWords(); WId++) {
TStr WdStr = BowDocBs->GetWordStr(WId);
WdMapSOut.PutStrLn(TStr::Fmt("%d %s", WId+1, WdStr.CStr()));
}
WdMapSOut.Flush();
// generate map of col-ids to document names
TFOut DocMapSOut(TStr::PutFExt(FNm, ".col-to-docName-map.dat"));
for (int DocN = 0; DocN < DIdV.Len(); DocN++) {
const int DId = DIdV[DocN];
TStr DocNm = BowDocBs->GetDocNm(DId);
DocMapSOut.PutStrLn(TStr::Fmt("%d %d %s", DocN, DId, DocNm.CStr()));
}
DocMapSOut.Flush();
// save documents' sparse vectors
TFOut SOut(FNm);
for (int DocN = 0; DocN < DIdV.Len(); DocN++){
const int DId = DIdV[DocN];
PBowSpV DocSpV = BowDocWgtBs->GetSpV(DId);
const int DocWIds = DocSpV->GetWIds();
for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){
const int WId = DocSpV->GetWId(DocWIdN);
const double WordWgt = DocSpV->GetWgt(DocWIdN);
SOut.PutStrLn(TStr::Fmt("%d %d %.16f", WId+1, DocN+1, WordWgt));
}
}
SOut.Flush();
// save documents' category sparse vectors
if (!CatFNm.Empty()) {
TFOut CatSOut(CatFNm);
for (int DocN = 0; DocN < DIdV.Len(); DocN++){
const int DId = DIdV[DocN];
const int DocCIds = BowDocBs->GetDocCIds(DId);
for (int DocCIdN=0; DocCIdN<DocCIds; DocCIdN++){
const int CId = BowDocBs->GetDocCId(DId, DocCIdN);
const double CatWgt = 1.0;
CatSOut.PutStrLn(TStr::Fmt("%d %d %.16f", CId+1, DocN+1, CatWgt));
}
}
CatSOut.Flush();
}
}
示例3: TBowMatrix
// Builds a matrix with one sparse column per document id in DIdV, taken from
// BowDocWgtBs, and fills ClsV with one label per document: 0.99 when the
// document carries category CatNm and -0.99 otherwise. The row count is the
// number of words in BowDocBs. Asserts that CatNm is a known category.
TBowMatrix::TBowMatrix(PBowDocBs BowDocBs, PBowDocWgtBs BowDocWgtBs,
    const TStr& CatNm, const TIntV& DIdV, TFltV& ClsV): TMatrix() {
  const int Docs = DIdV.Len();
  RowN = BowDocBs->GetWords();
  // reserve room for one label and one column per document
  ClsV.Gen(Docs, 0);
  ColSpVV.Gen(Docs, 0);
  IAssert(BowDocBs->IsCatNm(CatNm));
  const int CatId = BowDocBs->GetCId(CatNm);
  for (int DocN = 0; DocN < Docs; DocN++) {
    ColSpVV.Add(BowDocWgtBs->GetSpV(DIdV[DocN]));
    if (BowDocBs->IsCatInDoc(DIdV[DocN], CatId)) {
      ClsV.Add(0.99);
    } else {
      ClsV.Add(-0.99);
    }
  }
}
示例4: GetNmObjDIdV
void TNmObjBs::GetNmObjDIdV(
const PBowDocBs& BowDocBs, TIntV& BowDIdV,
const TStr& NmObjStr1, const TStr& NmObjStr2) const {
// get first named-object-id
int NmObjId1=GetNmObjId(NmObjStr1);
TIntV NmObjDocIdV1; GetNmObjDocIdV(NmObjId1, NmObjDocIdV1);
NmObjDocIdV1.Sort();
// get second named-object-id
TIntV NmObjDocIdV2;
if (!NmObjStr2.Empty()){
int NmObjId2=GetNmObjId(NmObjStr2);
GetNmObjDocIdV(NmObjId2, NmObjDocIdV2);
NmObjDocIdV2.Sort();
}
// create joint doc-id-vector
TIntV NmObjDocIdV;
if (NmObjDocIdV2.Empty()){
NmObjDocIdV=NmObjDocIdV1;
} else {
NmObjDocIdV1.Intrs(NmObjDocIdV2, NmObjDocIdV);
}
// traverse named-object-documents to collect bow-document-ids
BowDIdV.Gen(NmObjDocIdV.Len(), 0);
for (int NmObjDocIdN=0; NmObjDocIdN<NmObjDocIdV.Len(); NmObjDocIdN++){
TStr DocNm=GetDocNm(NmObjDocIdV[NmObjDocIdN]);
int DId=BowDocBs->GetDId(DocNm);
if (DId!=-1){
BowDIdV.Add(DId);
}
}
}
示例5: LoadLnDocTxt
// Reads a line-document file and adds every line to BowDocBs as a document.
// Line layout, as parsed below: [name] {!category} text
//   BowDocBs  - document base receiving the documents (via AddHtmlDoc)
//   LnDocFNm  - input file, one document per line
//   NewDIdV   - cleared, then filled with the ids returned by AddHtmlDoc
//   NamedP    - true: the leading token (up to a space or line end) is the
//               document name and lines without one are skipped;
//               false: the running document counter is used as the name
//   MxDocs    - maximal number of documents to load; -1 means no limit
//   SaveDocP  - forwarded unchanged to AddHtmlDoc
// Progress is echoed to stdout; BowDocBs->AssertOk() is called at the end.
void TBowFl::LoadLnDocTxt(PBowDocBs BowDocBs, const TStr& LnDocFNm,
    TIntV& NewDIdV, const bool& NamedP, const int& MxDocs, const bool& SaveDocP) {
  // open line-doc file
  NewDIdV.Clr(); TFIn FIn(LnDocFNm); char Ch=' '; int Docs=0;
  while (!FIn.Eof()){
    // stop only after MxDocs documents were processed; the limit is
    // inclusive, matching the check used by LoadHtmlTxt
    Docs++; if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
    printf("%d\r", Docs);
    // document name
    TChA DocNm;
    Ch=FIn.GetCh();
    if (NamedP){
      while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
        DocNm+=Ch; Ch=FIn.GetCh();}
      DocNm.Trunc();
      // no name on this line: do not count it as a document
      if (DocNm.Empty()){Docs--; continue;}
    } else {
      DocNm = TInt::GetStr(Docs);
    }
    // categories: a run of '!'-prefixed tokens separated by spaces
    TStrV CatNmV;
    forever {
      while ((!FIn.Eof())&&(Ch==' ')){Ch=FIn.GetCh();}
      if (Ch=='!'){
        if (!FIn.Eof()){Ch=FIn.GetCh();}
        TChA CatNm;
        while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
          CatNm+=Ch; Ch=FIn.GetCh();}
        if (!CatNm.Empty()){CatNmV.Add(CatNm);}
      } else {
        break;
      }
    }
    // document text: everything up to the end of the line
    TChA DocChA;
    while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')){
      DocChA+=Ch; Ch=FIn.GetCh();}
    // skip empty documents (empty lines)
    if (DocNm.Empty()&&DocChA.Empty()){
      continue;}
    // add document to document-base
    NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocChA, SaveDocP));
  }
  // check document-base consistency
  BowDocBs->AssertOk();
  printf("\n");
}
示例6: AddWds
// Registers one word "<Prefix>-<value>" in BowDocBs for every value of this
// feature generator. Asserts that each returned word id equals the running
// Offset, which is advanced by one per value.
void TFtrGen::AddWds(const TStr& Prefix,
    const PBowDocBs& BowDocBs, int& Offset) const {
  const int Vals = GetVals();
  int ValN = 0;
  while (ValN < Vals) {
    const int WId = BowDocBs->AddWordStr(
      TStr::Fmt("%s-%s", Prefix.CStr(), GetVal(ValN).CStr()));
    // word ids are expected to be handed out consecutively from Offset
    IAssert(Offset == WId);
    Offset++;
    ValN++;
  }
}
示例7: New
// Creates a centroid model for category CatNm: the training documents that
// carry the category are gathered, their concept vector is computed with
// cosine similarity, and that vector is added into a zero-initialised dense
// vector with one entry per word of BowDocBs.
PBowMd TBowCentroidMd::New(const PBowDocBs& BowDocBs,
    const PBowDocWgtBs& BowDocWgtBs, const TStr& CatNm,
    const TIntV& TrainDIdV) {
  // create model
  TBowCentroidMd* CentroidMd = new TBowCentroidMd(BowDocBs);
  PBowMd BowMd(CentroidMd);
  CentroidMd->CatNm = CatNm;
  // pick the training documents that belong to the category
  const int CId = BowDocBs->GetCId(CatNm);
  TIntV CatDIdV;
  for (int DIdN = 0; DIdN < TrainDIdV.Len(); DIdN++) {
    const int DId = TrainDIdV[DIdN];
    if (!BowDocBs->IsCatInDoc(DId, CId)) { continue; }
    CatDIdV.Add(DId);
  }
  // compute the centroid and store it as a dense vector
  PBowSim BowSim = TBowSim::New(bstCos);
  PBowSpV CentroidSpV = TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, CatDIdV);
  CentroidMd->CentroidV.Gen(BowDocBs->GetWords());
  CentroidMd->CentroidV.PutAll(0.0);
  TBowLinAlg::AddVec(1.0, CentroidSpV, CentroidMd->CentroidV);
  return CentroidMd;
}
示例8: GetOntoGroundNN
// Builds an ontology grounding by nearest-neighbour search. For every term of
// LwOnto in language LangNm, the term name is turned into a sparse vector,
// the documents are queried by cosine similarity over normalised TFIDF
// weights, and the returned documents' vectors are summed, each scaled by its
// similarity, into the term's concept vector. Progress is printed to stdout.
PLwOntoGround TLwOntoGround::GetOntoGroundNN(const PLwOnto& LwOnto,
    const PBowDocBs& BowDocBs, const TStr& LangNm) {
  printf("Generating Ontology-Classifier...\n");
  // shortcuts
  PLwTermBs TermBs=LwOnto->GetTermBs();
  const int Terms = TermBs->GetTerms();
  const int LangId = LwOnto->GetLangBs()->GetLangId(LangNm);
  const int Words = BowDocBs->GetWords();
  // weighting and similarity
  printf(" Creating BowDocWgtBs ...");
  PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
  PBowSim BowSim=TBowSim::New(bstCos);
  printf(" Done.\n");
  // one concept vector per term of the requested language
  printf(" Collecting documents per ontology-term ... ");
  THash<TInt, PBowSpV> TermIdToConceptSpVH;
  for (int TermN = 0; TermN < Terms; TermN++){
    const int TermId = TermBs->GetTermId(TermN);
    PLwTerm Term = TermBs->GetTerm(TermId);
    if (Term->GetLangId() == LangId) {
      // query the document base with the term name
      PBowSpV TermSpV = BowDocBs->GetSpVFromHtmlStr(
        Term->GetTermNm(), BowDocWgtBs);
      TFltIntKdV SimDIdKdV;
      BowDocWgtBs->GetSimDIdV(TermSpV, BowSim, SimDIdKdV, false);
      // accumulate the similarity-weighted sum of the returned documents
      TFltV TermV(Words); TermV.PutAll(0.0);
      const int SimDocs = SimDIdKdV.Len();
      for (int SimDocN = 0; SimDocN < SimDocs; SimDocN++) {
        PBowSpV DocSpV = BowDocWgtBs->GetSpV(SimDIdKdV[SimDocN].Dat);
        const double Sim = SimDIdKdV[SimDocN].Key;
        TBowLinAlg::AddVec(Sim, DocSpV, TermV);
      }
      TermIdToConceptSpVH.AddDat(TermId, TBowSpV::New(-1, TermV, TFlt::Eps));
    }
  }
  printf(" Done.\n");
  // create & return classifier
  PLwOntoGround OntoGround = TLwOntoGround::New(LwOnto,
    BowDocBs, BowDocWgtBs, TermIdToConceptSpVH);
  printf("Done.\n");
  return OntoGround;
}
示例9: AddBowDoc
// Generates the sparse feature vector for FtrValV and adds it to BowDocBs as
// a document named DocNm with an empty category list.
void TFtrGenBs::AddBowDoc(const PBowDocBs& BowDocBs,
    const TStr& DocNm, const TStrV& FtrValV) const {
  TIntFltKdV FtrSpV;
  GenFtrV(FtrValV, FtrSpV);
  // repack the (key, dat) entries as (word-id, weight) pairs
  const int Ftrs = FtrSpV.Len();
  TIntFltPrV WIdWgtPrV(Ftrs, 0);
  int FtrN = 0;
  while (FtrN < Ftrs) {
    WIdWgtPrV.Add(TIntFltPr(FtrSpV[FtrN].Key, FtrSpV[FtrN].Dat));
    FtrN++;
  }
  // store the document
  BowDocBs->AddDoc(DocNm, TStrV(), WIdWgtPrV);
}
示例10: NewMulti
// Trains one model per category returned by BowDocBs->GetTopCatV(TopCats)
// and bundles them in a multi-model, which is returned. A header line is
// printed to stdout for every category before its model is trained.
PBowMd TBowWinnowMd::NewMulti(
    const PBowDocBs& BowDocBs, const int& TopCats, const double& Beta){
  // container model; the smart pointer is what gets returned
  TBowMultiMd* MultiMd=new TBowMultiMd(BowDocBs);
  PBowMd MultiBowMd(MultiMd);
  // categories to train on
  TIntStrPrV FqCatNmPrV;
  BowDocBs->GetTopCatV(TopCats, FqCatNmPrV);
  const int Cats=FqCatNmPrV.Len();
  for (int CatN=0; CatN<Cats; CatN++){
    const TStr CatNm=FqCatNmPrV[CatN].Val2;
    const int CId=BowDocBs->GetCId(CatNm);
    // output header
    printf("*** Generating model for category: '%s' %d Docs (%d/%d Cats)\n",
      CatNm.CStr(), BowDocBs->GetCatFq(CId), 1+CId, BowDocBs->GetCats());
    // train the per-category model and add it to the set
    PBowMd CatBowMd=New(BowDocBs, CatNm, Beta);
    MultiMd->AddBowMd(CatBowMd);
  }
  return MultiBowMd;
}
示例11: LoadHtmlTxt
/////////////////////////////////////////////////
// BagOfWords-Files
// Walks the files under FPath (recursing when RecurseDirP is set) and adds
// each existing file to BowDocBs via AddHtmlDoc. The document name is the
// lower-cased file name with the lower-cased normalised FPath prefix removed;
// the part of that name before the first '/' (or, failing that, the first
// '\\') becomes the document's single category. At most MxDocs files are
// visited (-1 means no limit). NewDIdV is cleared and then receives the ids
// of the added documents. Status messages go to Notify.
void TBowFl::LoadHtmlTxt(
    PBowDocBs BowDocBs, const TStr& FPath, TIntV& NewDIdV,
    const bool& RecurseDirP, const int& MxDocs,
    const bool& SaveDocP, const PNotify& Notify) {
  // prepare file-directory traversal
  TStr LcNrFPath=TStr::GetNrFPath(FPath).GetLc();
  Notify->OnStatus("Creating Bow from file-path " + FPath + " ...");
  TFFile FFile(FPath, "", RecurseDirP);
  // traverse files
  NewDIdV.Clr();
  int Docs=0;
  TStr FNm;
  while (FFile.Next(FNm)){
    Docs++;
    if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
    Notify->OnStatus(TStr::Fmt("%d\r", Docs));
    // entries that are not existing files are ignored
    if (!TFile::Exists(FNm)){continue;}
    // document name: lower-cased file name relative to the root path
    TStr DocNm=FNm.GetLc();
    if (DocNm.IsPrefix(LcNrFPath)){
      DocNm=DocNm.GetSubStr(LcNrFPath.Len(), DocNm.Len()-1);}
    // category: text before the first path separator, '/' tried first
    char SepCh=0;
    if (DocNm.IsChIn('/')){
      SepCh='/';
    } else if (DocNm.IsChIn('\\')){
      SepCh='\\';
    }
    TStr CatNm;
    if (SepCh!=0){
      TStr RestStr; DocNm.SplitOnCh(CatNm, SepCh, RestStr);
    }
    TStrV CatNmV;
    if (!CatNm.Empty()){
      CatNmV.Add(CatNm);}
    // load the content and add the document to bow
    TStr DocStr=TStr::LoadTxt(FNm);
    NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP));
  }
  Notify->OnStatus(TStr::Fmt("%d", Docs));
  // finish
  Notify->OnStatus("Done.");
  BowDocBs->AssertOk();
}
示例12: GetDocCentroid
// Computes the centroid (concept vector) of this entity's documents and
// returns its top words.
//   SkyGridBs, MnTm   - forwarded to GetDocIdV to select the documents
//   BowDocBs          - maps document names (the decimal document id) to
//                       bag-of-words document ids and resolves word strings
//   BowDocWgtBs       - weighted document vectors used for the centroid
//   TopWords, TopWordsWgtSumPrc - forwarded to GetWordStrWgtPrV
//   Docs              - output: number of documents returned by GetDocIdV
//   WordStrWgtPrV     - output: (word, weight) pairs of the centroid
void TSkyGridEnt::GetDocCentroid(const TSkyGridBs* SkyGridBs,
    const PBowDocBs& BowDocBs, const PBowDocWgtBs& BowDocWgtBs,
    const uint64& MnTm, const int& TopWords, const double& TopWordsWgtSumPrc,
    int& Docs, TStrFltPrV& WordStrWgtPrV) const {
  // get doc-ids
  TIntV DocIdV; GetDocIdV(SkyGridBs, MnTm, 0, DocIdV);
  TIntV BowDIdV(DocIdV.Len(), 0);
  for (int DocN=0; DocN<DocIdV.Len(); DocN++){
    int DocId=DocIdV[DocN];
    TStr BowDocNm=TInt::GetStr(DocId);
    int BowDId=BowDocBs->GetDId(BowDocNm);
    // GetDId yields -1 for a name missing from the document base; such an
    // id must not be passed on to the centroid computation
    if (BowDId!=-1){
      BowDIdV.Add(BowDId);
    }
  }
  // create concept vector
  PBowSim BowSim=TBowSim::New(bstCos); // similarity object
  PBowSpV ConceptSpV=TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, BowDIdV);
  // get docs & word-vector
  Docs=DocIdV.Len();
  ConceptSpV->GetWordStrWgtPrV(BowDocBs, TopWords, TopWordsWgtSumPrc, WordStrWgtPrV);
}
示例13: LoadCsv
PBowDocBs TFtrGenBs::LoadCsv(TStr& FNm, const int& ClassId,
const TIntV& IgnoreIdV, const int& TrainLen) {
// feature generators
PFtrGenBs FtrGenBs = TFtrGenBs::New();
// CSV parsing stuff
PSIn SIn = TFIn::New(FNm);
char SsCh = ' '; TStrV FldValV;
// read the headers and initialise the feature generators
TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
const TStr& FldVal = FldValV[FldValN];
if (FldValN == ClassId) {
if (FldVal == "NOM") {
FtrGenBs->PutClsFtrGen(TFtrGenNominal::New());
} else if (FldVal == "MULTI-NOM") {
FtrGenBs->PutClsFtrGen(TFtrGenMultiNom::New());
} else {
TExcept::Throw("Wrong class type '" + FldVal + "', should be NOM or MULTI-NOM!");
}
} else if (!IgnoreIdV.IsIn(FldValN)) {
if (FldVal == TFtrGenNumeric::GetType()) {
FtrGenBs->AddFtrGen(TFtrGenNumeric::New());
} else if (FldVal == TFtrGenNominal::GetType()) {
FtrGenBs->AddFtrGen(TFtrGenNominal::New());
} else if (FldVal == TFtrGenToken::GetType()) {
FtrGenBs->AddFtrGen(TFtrGenToken::New(
TSwSet::New(swstNone), TStemmer::New(stmtNone)));
} else if (FldVal == TFtrGenSparseNumeric::GetType()) {
FtrGenBs->AddFtrGen(TFtrGenSparseNumeric::New());
} else if (FldVal == TFtrGenMultiNom::GetType()) {
FtrGenBs->AddFtrGen(TFtrGenMultiNom::New());
} else {
TExcept::Throw("Wrong type '" + FldVal + "'!");
}
}
}
const int Flds = FldValV.Len();
// read the lines and feed them to the feature generators
int Recs = 0;
while (!SIn->Eof()) {
if (Recs == TrainLen) { break; }
Recs++; printf("%7d\r", Recs);
TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
// make sure line still has the same number of fields as the header
EAssertR(FldValV.Len() == Flds,
TStr::Fmt("Wrong number of fields in line %d! Found %d and expected %d!",
Recs + 1, FldValV.Len(), Flds));
// go over lines
try {
TStrV FtrValV;
for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
const TStr& FldVal = FldValV[FldValN];
if (FldValN == ClassId) {
FtrGenBs->UpdateCls(FldVal);
} else if (!IgnoreIdV.IsIn(FldValN)) {
FtrValV.Add(FldVal);
}
}
FtrGenBs->Update(FtrValV);
} catch (PExcept Ex) {
TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!",
Recs+1, Ex->GetMsgStr().CStr()));
}
}
// read the file again and feed it to the training set
PBowDocBs BowDocBs = FtrGenBs->MakeBowDocBs();
// we read and ignore the headers since we parsed them already
SIn = TFIn::New(FNm); SsCh = ' ';
TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
// read the lines and feed them to the training set
Recs = 0;
while (!SIn->Eof()){
Recs++; printf("%7d\r", Recs);
TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
// make sure line still has the same number of fields as the header
EAssertR(FldValV.Len() == Flds,
TStr::Fmt("Wrong number of fields in line %s! Found %d and expected %d!",
Recs + 1, FldValV.Len(), Flds));
// go over lines and construct the sparse vector
TStrV FtrValV; TStr ClsFtrVal;
try {
for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
const TStr& FldVal = FldValV[FldValN];
if (FldValN == ClassId) {
ClsFtrVal = FldVal;
} else if (!IgnoreIdV.IsIn(FldValN)) {
FtrValV.Add(FldVal);
}
}
} catch (PExcept Ex) {
TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!",
Recs+1, Ex->GetMsgStr().CStr()));
}
// add the feature vector to trainsets
FtrGenBs->AddBowDoc(BowDocBs, TStr::Fmt("Line-%d", Recs), FtrValV, ClsFtrVal);
}
// prepare training and testing doc ids
TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV); IAssert(AllDIdV.IsSorted());
TIntV TrainDIdV = AllDIdV; TrainDIdV.Trunc(TrainLen);
//.........这里部分代码省略.........
示例14: main
int main(int argc, char* argv[]){
Try;
// create environment
Env=TEnv(argc, argv, TNotify::StdNotify);
// command line parameters
Env.PrepArgs("Text To Bag-Of-Words");
TStr InFPath=Env.GetIfArgPrefixStr("-ihtml:", "", "Input-Html-Path");
TStr InMtxFNm=Env.GetIfArgPrefixStr("-imtx:", "", "Input-Matrix-File");
TStr InTabFNm=Env.GetIfArgPrefixStr("-itab:", "", "Input-Tab-File");
TStr InTsactFNm=Env.GetIfArgPrefixStr("-itsc:", "", "Input-Transaction-File");
TStr InSparseFNm=Env.GetIfArgPrefixStr("-ispr:", "", "Input-Sparse-File");
TStr InSvmLightFNm=Env.GetIfArgPrefixStr("-isvml:", "", "Input-SvmLight-File");
TStr InCpdFNm=Env.GetIfArgPrefixStr("-icpd:", "", "Input-CompactDocuments-File");
TStr InTBsFNm=Env.GetIfArgPrefixStr("-itbs:", "", "Input-TextBase-File");
TStr InLnDocFNm=Env.GetIfArgPrefixStr("-ilndoc:", "", "Input-LineDocuments-File");
TStr InNmLnDocFNm=Env.GetIfArgPrefixStr("-inlndoc:", "", "Input-Named-LineDocuments-File");
TStr InReuters21578FPath=Env.GetIfArgPrefixStr("-ir21578:", "", "Input-Reuters21578-Path");
TStr InCiaWFBFPath=Env.GetIfArgPrefixStr("-iciawfb:", "", "Input-CIA-World-Fact-Book-Path");
TStr InDaxFNm=Env.GetIfArgPrefixStr("-idax:", "", "Input-DocumentAtlasXML-File");
TStr OutBowFNm=Env.GetIfArgPrefixStr("-o:", "Out.Bow", "Bow-Output-File (.Bow)");
bool OutStatP=Env.GetIfArgPrefixBool("-ostat:", true, "Output-Statistics (*.Txt)");
int Recs=Env.GetIfArgPrefixInt("-docs:", -1, "Documents-To-Process");
bool RecurseDirP=Env.GetIfArgPrefixBool("-recurse:", false, "Recurse-Directories");
TStr SwSetTypeNm=Env.GetIfArgPrefixStr("-stopword:", "en523", "Stop-Word-Set "+TSwSet::GetSwSetTypeNmVStr());
TStr SwSetFNm=Env.GetIfArgPrefixStr("-istopword:", "", "External-Stop-Word-Set-File");
TStr StemmerTypeNm=Env.GetIfArgPrefixStr("-stemmer:", "porter", "Stemmer "+TStemmer::GetStemmerTypeNmVStr());
int MxNGramLen=Env.GetIfArgPrefixInt("-ngramlen:", 3, "Max-NGram-Length");
int MnNGramFq=Env.GetIfArgPrefixInt("-ngramfq:", 5, "Min-NGram-Frequency");
bool SaveDocP=Env.GetIfArgPrefixBool("-savedoc:", false, "Save-Document-Text");
if (Env.IsEndOfRun()){return 0;}
// -idir:f:\data\ciawfb\print -o:CiaWfb.Bow -docs:50
// -isvml:SvmLightTrain.Dat -o:SvmLight.Bow
// -ir21578:f:\data\Reuters21578 -o:Reuters21578.Bow
// -inlndoc:c:\data\yahoocompanies\CompProfilesSymbols.txt
// -ihtml:c:\data\cordis\fp6
// bag-of-words to create
PBowDocBs BowDocBs;
// load input data
if (!InFPath.Empty()){ // directory-files
// prepare stop-words
PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
if (!SwSetFNm.Empty()) { SwSet->LoadFromFile(SwSetFNm); }
// prepare stemmer
PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
// load bow
BowDocBs=TBowFl::LoadHtmlTxt(InFPath, RecurseDirP, Recs,
SwSet, Stemmer, MxNGramLen, MnNGramFq, SaveDocP);
} else
if (!InMtxFNm.Empty()){ // matrix-file
PBowSimMtx BowSimMtx=TBowSimMtx::LoadTxt(InMtxFNm);
BowDocBs=TBowFl::LoadFromSimMtx(BowSimMtx);
} else
if (!InTabFNm.Empty()){ // tab-file
BowDocBs=TBowFl::LoadTabTxt(InTabFNm, Recs);
} else
if (!InTsactFNm.Empty()){ // transaction-file
BowDocBs=TBowFl::LoadTsactTxt(InTsactFNm, Recs);
} else
if (!InSparseFNm.Empty()){ // sparse-file
TStr DocDefFNm=TStr::PutFExt(InSparseFNm, ".tup");
TStr WordDefFNm=TStr::PutFExt(InSparseFNm, ".var");
TStr TrainDataFNm=InSparseFNm;
BowDocBs=TBowFl::LoadSparseTxt(DocDefFNm, WordDefFNm, TrainDataFNm, Recs);
} else
if (!InSvmLightFNm.Empty()){ // SvmLight-file
TStr DocDefFNm=TStr::PutFExt(InSparseFNm, ".tup");
TStr WordDefFNm=TStr::PutFExt(InSparseFNm, ".var");
TStr TrainDataFNm=InSvmLightFNm;
BowDocBs=TBowFl::LoadSvmLightTxt(DocDefFNm, WordDefFNm, TrainDataFNm, "", Recs);
} else
if (!InTBsFNm.Empty()){ // Text-Base-file
//BowDocBs=TBowFl::LoadTBsTxt(InTBsFNm, Recs);
} else
if (!InCpdFNm.Empty()){ // Compact-Doc-file
BowDocBs=TBowFl::LoadCpdTxt(InCpdFNm, Recs,
SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq);
} else
if (!InLnDocFNm.Empty()){ // Line-Documents-file
BowDocBs=TBowFl::LoadLnDocTxt(InLnDocFNm, false, Recs,
SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq, SaveDocP);
} else
if (!InNmLnDocFNm.Empty()){ // Named-Line-Documents-file
BowDocBs=TBowFl::LoadLnDocTxt(InNmLnDocFNm, true, Recs,
SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq, SaveDocP);
} else
if (!InReuters21578FPath.Empty()){ // Reuters-21578-file
BowDocBs=TBowFl::LoadReuters21578Txt(InReuters21578FPath, Recs,
SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq);
} else
if (!InCiaWFBFPath.Empty()){ // CIA-World-Fact-Book
BowDocBs=TBowFl::LoadCiaWFBTxt(InCiaWFBFPath, Recs,
SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq);
} else
if (!InDaxFNm.Empty()) { // DocumentAtlasXml-File
PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
BowDocBs=TVizMapXmlDocBs::LoadBowDocBs(InDaxFNm,
//.........这里部分代码省略.........
示例15: GetOntoGround
PLwOntoGround TLwOntoGround::GetOntoGround(
const PLwOnto& LwOnto, const PBowDocBs& BowDocBs,
const TStr& LangNm, const bool& DocCatIsTermIdP,
const double& CutWordWgtSumPrc){
printf("Generating Ontology-Classifier...\n");
// shortcuts
PLwTermBs TermBs=LwOnto->GetTermBs();
int Terms=TermBs->GetTerms();
PLwLinkBs LinkBs=LwOnto->GetLinkBs();
PLwLinkTypeBs LinkTypeBs=LwOnto->GetLinkTypeBs();
int LangId=LwOnto->GetLangBs()->GetLangId(LangNm);
int Docs=BowDocBs->GetDocs();
// create tfidf
printf(" Creating BowDocWgtBs ...");
PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
PBowSim BowSim=TBowSim::New(bstCos);
printf(" Done.\n");
// collect documents per ontology-term
printf(" Collecting documents per ontology-term ...\n");
TIntIntVH TermIdToDIdVH; int PosCats=0; int NegCats=0;
for (int DId=0; DId<Docs; DId++){
printf(" Docs:%d/%d Pos:%d Neg:%d\r", 1+DId, Docs, PosCats, NegCats);
for (int DocCIdN=0; DocCIdN<BowDocBs->GetDocCIds(DId); DocCIdN++){
// get document-category
int CId=BowDocBs->GetDocCId(DId, DocCIdN);
TStr CatNm=BowDocBs->GetCatNm(CId);
// get term-id
if (DocCatIsTermIdP){
int TermId=CatNm.GetInt();
if (TermBs->IsTermId(TermId)){
TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
} else {NegCats++;}
} else {
if (TermBs->IsTermId(CatNm, LangId)){
int TermId=TermBs->GetTermId(CatNm, LangId);
TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
} else {NegCats++;}
}
}
}
printf(" Docs:%d/%d Pos:%d Neg:%d\n", Docs, Docs, PosCats, NegCats);
printf(" Done.\n");
// create sub-terms & up-terms vectors
printf(" Creating sub-terms & up-terms vectors ...");
TIntIntVH Const_TermIdToSubTermIdVH;
TIntIntVH TermIdToSubTermIdVH;
TIntIntVH TermIdToUpTermIdVH;
for (int TermN=0; TermN<Terms; TermN++){
int TermId=TermBs->GetTermId(TermN);
for (int LinkN=0; LinkN<LinkBs->GetFromLinks(TermId); LinkN++){
int LinkTypeId; int DstTermId;
LinkBs->GetFromLink(TermId, LinkN, LinkTypeId, DstTermId);
TStr LinkTypeNm=LinkTypeBs->GetLinkType(LinkTypeId)->GetLinkTypeNm();
if (LinkTypeNm=="NT"){
Const_TermIdToSubTermIdVH.AddDat(TermId).Add(DstTermId);
TermIdToSubTermIdVH.AddDat(TermId).Add(DstTermId);
TermIdToUpTermIdVH.AddDat(DstTermId).Add(TermId);
}
}
}
printf(" Done.\n");
// create centroids
printf(" Creating centroids ...\n");
THash<TInt, PBowSpV> TermIdToConceptSpVH;
TIntIntVH TermIdToSubTermDIdVH;
TIntH ProcTermIdH;
int PrevActiveTerms=-1;
forever{
// count active nodes for processing
int ActiveTerms=0;
for (int TermN=0; TermN<Terms; TermN++){
int TermId=TermBs->GetTermId(TermN);
if ((TermIdToSubTermIdVH.IsKey(TermId))&&
(TermIdToSubTermIdVH.GetDat(TermId).Len()>0)){
ActiveTerms++;
}
}
// stop if no change from previous round
printf(" Active-Terms:%d\n", ActiveTerms);
if (ActiveTerms==PrevActiveTerms){break;}
PrevActiveTerms=ActiveTerms;
// reduce active-nodes with zero-ancestors
for (int TermN=0; TermN<Terms; TermN++){
int TermId=TermBs->GetTermId(TermN);
if (ProcTermIdH.IsKey(TermId)){continue;}
if ((!TermIdToSubTermIdVH.IsKey(TermId))||
(TermIdToSubTermIdVH.GetDat(TermId).Len()==0)){
printf(" %d/%d\r", 1+TermN, Terms);
ProcTermIdH.AddKey(TermId);
// collect document-ids
TIntV TermDIdV;
if (TermIdToDIdVH.IsKey(TermId)){
TermDIdV.AddV(TermIdToDIdVH.GetDat(TermId));}
if (TermIdToSubTermDIdVH.IsKey(TermId)){
TermDIdV.AddV(TermIdToSubTermDIdVH.GetDat(TermId));}
// create concept-vector if any documents
if (TermDIdV.Len()>0){
PBowSpV ConceptSpV=
TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, TermDIdV, CutWordWgtSumPrc);
TermIdToConceptSpVH.AddDat(TermId, ConceptSpV);
//.........这里部分代码省略.........