当前位置: 首页>>代码示例>>C++>>正文


C++ PBowDocBs类代码示例

本文整理汇总了C++中PBowDocBs的典型用法代码示例。如果您正苦于以下问题:C++ PBowDocBs类的具体用法?C++ PBowDocBs怎么用?C++ PBowDocBs使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了PBowDocBs类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。

示例1: SaveLnDocTxt

void TBowFl::SaveLnDocTxt(const PBowDocBs& BowDocBs, const TStr& FNm, const bool& UseDocStrP){
  TFOut SOut(FNm);
  int Docs=BowDocBs->GetDocs();
  for (int DId=0; DId<Docs; DId++){
    printf("%d/%d\r", DId+1, Docs);
    // output document-name
    TStr DocNm=TStr::GetFNmStr(BowDocBs->GetDocNm(DId));
    SOut.PutStr(DocNm);
    // output categories
    for (int CIdN=0; CIdN<BowDocBs->GetDocCIds(DId); CIdN++){
      int CId=BowDocBs->GetDocCId(DId, CIdN);
      TStr CatNm=TStr::GetFNmStr(BowDocBs->GetCatNm(CId));
      SOut.PutCh(' '); SOut.PutCh('!'); SOut.PutStr(CatNm);
    }
    // output words
    if (UseDocStrP){
      TStr DocStr=BowDocBs->GetDocStr(DId);
//      DocStr.DelChAll('\n'); DocStr.DelChAll('\r');
      SOut.PutCh(' '); SOut.PutStr(DocStr);
    } else {
        int DocWIds=BowDocBs->GetDocWIds(DId);
        int WId; double WordFq;
        for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){
          BowDocBs->GetDocWIdFq(DId, DocWIdN, WId, WordFq);
          TStr WordStr=BowDocBs->GetWordStr(WId);
          for (int WordFqN=0; WordFqN<WordFq; WordFqN++){
            SOut.PutCh(' '); SOut.PutStr(WordStr);
          }
        }
    }
    SOut.PutLn();
  }
  printf("\n");
}
开发者ID:Accio,项目名称:snap,代码行数:34,代码来源:bowfl.cpp

示例2: SaveSparseMatlabTxt

/// Exports weighted document vectors as "row col value" triplets.
///
/// Files written:
///  - FNm with extension ".row-to-word-map.dat": "<WId+1> <word>" per word;
///  - FNm with extension ".col-to-docName-map.dat": "<DocN> <DId> <name>"
///    per exported document;
///  - FNm: "<WId+1> <DocN+1> <weight>" per non-zero word of each document;
///  - CatFNm (only when non-empty): "<CId+1> <DocN+1> 1.0" per category of
///    each document.
/// When _DIdV is empty all documents of BowDocBs are exported; otherwise
/// only the listed ones, in the given order.
void TBowFl::SaveSparseMatlabTxt(const PBowDocBs& BowDocBs,
    const PBowDocWgtBs& BowDocWgtBs, const TStr& FNm,
    const TStr& CatFNm, const TIntV& _DIdV) {

  // decide which documents to export
  TIntV DIdV;
  if (!_DIdV.Empty()) {
      DIdV = _DIdV;
  } else {
      BowDocBs->GetAllDIdV(DIdV);
  }
  const int ExpDocs = DIdV.Len();

  // row-id -> word map (row ids are written one-based)
  TFOut WdMapSOut(TStr::PutFExt(FNm, ".row-to-word-map.dat"));
  const int Words = BowDocWgtBs->GetWords();
  for (int WId = 0; WId < Words; WId++) {
    const TStr WdStr = BowDocBs->GetWordStr(WId);
    WdMapSOut.PutStrLn(TStr::Fmt("%d %s", WId+1,  WdStr.CStr()));
  }
  WdMapSOut.Flush();

  // column -> document-name map
  // NOTE(review): the column index here is zero-based (DocN) while the
  // matrix files below use DocN+1 -- confirm whether that is intended.
  TFOut DocMapSOut(TStr::PutFExt(FNm, ".col-to-docName-map.dat"));
  for (int DocN = 0; DocN < ExpDocs; DocN++) {
    const int DId = DIdV[DocN];
    const TStr DocNm = BowDocBs->GetDocNm(DId);
    DocMapSOut.PutStrLn(TStr::Fmt("%d %d %s", DocN, DId,  DocNm.CStr()));
  }
  DocMapSOut.Flush();

  // word-weight triplets, one line per stored entry of each sparse vector
  TFOut SOut(FNm);
  for (int DocN = 0; DocN < ExpDocs; DocN++){
    PBowSpV DocSpV = BowDocWgtBs->GetSpV(DIdV[DocN]);
    const int Entries = DocSpV->GetWIds();
    for (int EntryN = 0; EntryN < Entries; EntryN++){
      const int WId = DocSpV->GetWId(EntryN);
      const double WordWgt = DocSpV->GetWgt(EntryN);
      SOut.PutStrLn(TStr::Fmt("%d %d %.16f", WId+1, DocN+1, WordWgt));
    }
  }
  SOut.Flush();

  // category triplets are optional
  if (CatFNm.Empty()) { return; }
  TFOut CatSOut(CatFNm);
  const double CatWgt = 1.0; // every category membership gets unit weight
  for (int DocN = 0; DocN < ExpDocs; DocN++){
    const int DId = DIdV[DocN];
    const int DocCIds = BowDocBs->GetDocCIds(DId);
    for (int CIdN = 0; CIdN < DocCIds; CIdN++){
      const int CId = BowDocBs->GetDocCId(DId, CIdN);
      CatSOut.PutStrLn(TStr::Fmt("%d %d %.16f", CId+1, DocN+1, CatWgt));
    }
  }
  CatSOut.Flush();
}
开发者ID:Accio,项目名称:snap,代码行数:53,代码来源:bowflx.cpp

示例3: TBowMatrix

/// Collects the weighted sparse vectors of the documents in DIdV as matrix
/// columns and fills ClsV with one label per document: 0.99 when the
/// document belongs to category CatNm, -0.99 otherwise. The row count is
/// the number of words in BowDocBs. CatNm must be a known category
/// (checked with IAssert).
TBowMatrix::TBowMatrix(PBowDocBs BowDocBs, PBowDocWgtBs BowDocWgtBs,
        const TStr& CatNm,  const TIntV& DIdV, TFltV& ClsV): TMatrix() {

    const int Docs = DIdV.Len();
    RowN = BowDocBs->GetWords();
    // both outputs start empty with room reserved for one entry per document
    ClsV.Gen(Docs, 0);
    ColSpVV.Gen(Docs, 0);
    IAssert(BowDocBs->IsCatNm(CatNm));
    const int CatId = BowDocBs->GetCId(CatNm);
    for (int DocN = 0; DocN < Docs; DocN++) {
        const int DId = DIdV[DocN];
        ColSpVV.Add(BowDocWgtBs->GetSpV(DId));
        if (BowDocBs->IsCatInDoc(DId, CatId)) {
            ClsV.Add(0.99);
        } else {
            ClsV.Add(-0.99);
        }
    }
}
开发者ID:adobekan,项目名称:qminer,代码行数:13,代码来源:bowlinalg.cpp

示例4: GetNmObjDIdV

void TNmObjBs::GetNmObjDIdV(
 const PBowDocBs& BowDocBs, TIntV& BowDIdV, 
 const TStr& NmObjStr1, const TStr& NmObjStr2) const {
  // get first named-object-id
  int NmObjId1=GetNmObjId(NmObjStr1);
  TIntV NmObjDocIdV1; GetNmObjDocIdV(NmObjId1, NmObjDocIdV1);
  NmObjDocIdV1.Sort();
  // get second named-object-id
  TIntV NmObjDocIdV2;
  if (!NmObjStr2.Empty()){
    int NmObjId2=GetNmObjId(NmObjStr2);
    GetNmObjDocIdV(NmObjId2, NmObjDocIdV2);
    NmObjDocIdV2.Sort();
  }
  // create joint doc-id-vector
  TIntV NmObjDocIdV;
  if (NmObjDocIdV2.Empty()){
    NmObjDocIdV=NmObjDocIdV1;
  } else {
    NmObjDocIdV1.Intrs(NmObjDocIdV2, NmObjDocIdV);
  }
  // traverse named-object-documents to collect bow-document-ids
  BowDIdV.Gen(NmObjDocIdV.Len(), 0);
  for (int NmObjDocIdN=0; NmObjDocIdN<NmObjDocIdV.Len(); NmObjDocIdN++){
    TStr DocNm=GetDocNm(NmObjDocIdV[NmObjDocIdN]);
    int DId=BowDocBs->GetDId(DocNm);
    if (DId!=-1){
      BowDIdV.Add(DId);
    } 
  }
}
开发者ID:AlertProject,项目名称:Text-processing-bundle,代码行数:31,代码来源:nmobj.cpp

示例5: LoadLnDocTxt

/// Loads a line-document text file into BowDocBs, one document per line.
///
/// Line layout as parsed here: an optional document name (only when NamedP;
/// it ends at the first space or line break), then any number of
/// space-separated "!category" tokens, then the document text up to the
/// line break. When NamedP is false the running document counter is used
/// as the name.
///
/// @param BowDocBs  document base receiving the documents (AddHtmlDoc)
/// @param LnDocFNm  input file name
/// @param NewDIdV   output; cleared, then receives the id of each added doc
/// @param NamedP    whether each line starts with a document name
/// @param MxDocs    maximum number of documents to load, -1 for no limit
/// @param SaveDocP  forwarded to AddHtmlDoc
void TBowFl::LoadLnDocTxt(PBowDocBs BowDocBs, const TStr& LnDocFNm,
 TIntV& NewDIdV, const bool& NamedP, const int& MxDocs, const bool& SaveDocP) {
  // open line-doc file
  NewDIdV.Clr(); TFIn FIn(LnDocFNm); char Ch=' '; int Docs=0;
  while (!FIn.Eof()){
    // Test the limit BEFORE counting the next document: testing after the
    // increment stopped one document early (MxDocs==1 loaded nothing).
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    Docs++;
    printf("%d\r", Docs);
    // document name
    TChA DocNm;
    Ch=FIn.GetCh();
    if (NamedP){
      while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
        DocNm+=Ch; Ch=FIn.GetCh();}
      DocNm.Trunc();
      // a line without a name does not count as a document
      if (DocNm.Empty()){Docs--; continue;}
    } else {
        DocNm = TInt::GetStr(Docs);
    }
    // categories: consecutive tokens that start with '!'
    TStrV CatNmV;
    forever {
      while ((!FIn.Eof())&&(Ch==' ')){Ch=FIn.GetCh();}
      if (Ch=='!'){
        if (!FIn.Eof()){Ch=FIn.GetCh();}
        TChA CatNm;
        while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
          CatNm+=Ch; Ch=FIn.GetCh();}
        if (!CatNm.Empty()){CatNmV.Add(CatNm);}
      } else {
        // first non-category character: the document text starts here
        break;
      }
    }
    // document text: everything up to the end of the line
    TChA DocChA;
    while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')){
      DocChA+=Ch; Ch=FIn.GetCh();}
    // skip empty documents (empty lines)
    if (DocNm.Empty()&&DocChA.Empty()){
      continue;}
    // add document to document-base
    NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocChA, SaveDocP));
  }
  // check the document-base after loading
  BowDocBs->AssertOk();
  printf("\n");
}
开发者ID:Accio,项目名称:snap,代码行数:46,代码来源:bowfl.cpp

示例6: AddWds

void TFtrGen::AddWds(const TStr& Prefix, 
        const PBowDocBs& BowDocBs, int& Offset) const {

    const int Vals = GetVals();
    for (int ValN = 0; ValN < Vals; ValN++) {
        const int WId = BowDocBs->AddWordStr(
            TStr::Fmt("%s-%s", Prefix.CStr(), GetVal(ValN).CStr()));
        IAssert(Offset == WId); Offset++;
    }
}
开发者ID:mkarlovc,项目名称:gcentralization,代码行数:10,代码来源:ftrgen.cpp

示例7: New

/// Creates a centroid model for category CatNm.
///
/// The training documents from TrainDIdV that belong to CatNm are
/// selected, their concept vector is computed with cosine similarity, and
/// the result is stored as a dense vector with one entry per word of
/// BowDocBs.
///
/// @param BowDocBs     document base (categories, word count)
/// @param BowDocWgtBs  weighted document vectors used for the centroid
/// @param CatNm        category the model is built for
/// @param TrainDIdV    ids of the training documents
/// @return the new model wrapped in a PBowMd
PBowMd TBowCentroidMd::New(const PBowDocBs& BowDocBs, 
		const PBowDocWgtBs& BowDocWgtBs, const TStr& CatNm, 
		const TIntV& TrainDIdV) {

	// create model; BowMd owns it, CentroidMd gives typed access to fields
	TBowCentroidMd* CentroidMd = new TBowCentroidMd(BowDocBs); 
	PBowMd BowMd(CentroidMd); CentroidMd->CatNm = CatNm;
	// keep only the training documents that carry the category
	TIntV CatDIdV; const int CId = BowDocBs->GetCId(CatNm);
	for (int TrainDIdN = 0; TrainDIdN < TrainDIdV.Len(); TrainDIdN++) {
		const int DId = TrainDIdV[TrainDIdN];
		if (BowDocBs->IsCatInDoc(DId, CId)) { CatDIdV.Add(DId); }
	}
	// compute centroid
	PBowSim BowSim = TBowSim::New(bstCos);
	PBowSpV CentroidSpV = TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, CatDIdV);
	// start from an all-zero dense vector and add the sparse centroid to it
	CentroidMd->CentroidV.Gen(BowDocBs->GetWords());
	CentroidMd->CentroidV.PutAll(0.0);
	TBowLinAlg::AddVec(1.0, CentroidSpV, CentroidMd->CentroidV);
	// return the existing owner instead of re-wrapping the raw pointer
	return BowMd;
}
开发者ID:Austindeadhead,项目名称:qminer,代码行数:20,代码来源:bowmd.cpp

示例8: GetOntoGroundNN

/// Builds an ontology grounding by nearest-neighbour search: for every
/// ontology term in language LangNm, the term name is turned into a sparse
/// vector, similar documents are retrieved with cosine similarity over
/// normalised TFIDF weights, and the similarity-weighted sum of their
/// vectors becomes the term's concept vector. Progress is printed to
/// stdout.
PLwOntoGround TLwOntoGround::GetOntoGroundNN(const PLwOnto& LwOnto, 
        const PBowDocBs& BowDocBs, const TStr& LangNm) {

    printf("Generating Ontology-Classifier...\n");
    // handles and sizes used throughout
    PLwTermBs TermBs=LwOnto->GetTermBs();
    const int Terms = TermBs->GetTerms();
    const int LangId = LwOnto->GetLangBs()->GetLangId(LangNm);
    const int Words = BowDocBs->GetWords();
    // document weights and similarity measure
    printf("  Creating BowDocWgtBs ...");
    PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
    PBowSim BowSim=TBowSim::New(bstCos);
    printf(" Done.\n");
    // one concept vector per term of the requested language
    printf("  Collecting documents per ontology-term ... ");
    THash<TInt, PBowSpV> TermIdToConceptSpVH;
    for (int TermN = 0; TermN < Terms; TermN++){
        const int TermId = TermBs->GetTermId(TermN);
        PLwTerm Term = TermBs->GetTerm(TermId);
        if (Term->GetLangId() == LangId) {
            // query vector built from the term name
            PBowSpV QuerySpV = BowDocBs->GetSpVFromHtmlStr(
                Term->GetTermNm(), BowDocWgtBs);
            // (similarity, document-id) pairs for the query
            TFltIntKdV SimDIdKdV;
            BowDocWgtBs->GetSimDIdV(QuerySpV, BowSim, SimDIdKdV, false);
            // accumulate Sim * document-vector into a dense vector
            TFltV TermV(Words); TermV.PutAll(0.0);
            for (int HitN = 0; HitN < SimDIdKdV.Len(); HitN++) {
                PBowSpV DocSpV = BowDocWgtBs->GetSpV(SimDIdKdV[HitN].Dat);
                const double Sim = SimDIdKdV[HitN].Key;
                TBowLinAlg::AddVec(Sim, DocSpV, TermV);
            }
            TermIdToConceptSpVH.AddDat(TermId, TBowSpV::New(-1, TermV, TFlt::Eps));
        }
    }
    printf("  Done.\n");
    // wrap everything into the grounding object
    PLwOntoGround OntoGround = TLwOntoGround::New(LwOnto, 
        BowDocBs, BowDocWgtBs, TermIdToConceptSpVH);
    printf("Done.\n");
    return OntoGround;
}
开发者ID:Accio,项目名称:snap,代码行数:41,代码来源:ontolight.cpp

示例9: AddBowDoc

/// Generates the sparse feature vector for FtrValV and adds it to BowDocBs
/// as a document named DocNm with an empty category list.
void TFtrGenBs::AddBowDoc(const PBowDocBs& BowDocBs,
		const TStr& DocNm, const TStrV& FtrValV) const {

    // sparse features as (key, dat) entries
    TIntFltKdV FtrSpV;
    GenFtrV(FtrValV, FtrSpV);
    // AddDoc is given (word-id, weight) pairs, so copy entry by entry
    const int Ftrs = FtrSpV.Len();
    TIntFltPrV WIdWgtPrV(Ftrs, 0);
    int FtrN = 0;
    while (FtrN < Ftrs) {
        WIdWgtPrV.Add(TIntFltPr(FtrSpV[FtrN].Key, FtrSpV[FtrN].Dat));
        FtrN++;
    }
    // store the document without categories
    BowDocBs->AddDoc(DocNm, TStrV(), WIdWgtPrV);
}
开发者ID:mkarlovc,项目名称:gcentralization,代码行数:12,代码来源:ftrgen.cpp

示例10: NewMulti

/// Builds a multi-model holding one model (created by New) for each of
/// the TopCats categories returned by GetTopCatV. A header line is printed
/// to stdout for every category.
PBowMd TBowWinnowMd::NewMulti(
 const PBowDocBs& BowDocBs, const int& TopCats, const double& Beta){
  // container model; MultiBowMd owns it, MultiMd gives typed access
  TBowMultiMd* MultiMd=new TBowMultiMd(BowDocBs);
  PBowMd MultiBowMd(MultiMd);
  // (frequency, category-name) pairs of the selected categories
  TIntStrPrV FqCatNmPrV;
  BowDocBs->GetTopCatV(TopCats, FqCatNmPrV);
  const int SelCats=FqCatNmPrV.Len();
  for (int CatN=0; CatN<SelCats; CatN++){
    TStr CatNm=FqCatNmPrV[CatN].Val2;
    const int CId=BowDocBs->GetCId(CatNm);
    // header line for this category
    printf("*** Generating model for category: '%s' %d Docs (%d/%d Cats)\n",
     CatNm.CStr(), BowDocBs->GetCatFq(CId), 1+CId, BowDocBs->GetCats());
    // build the per-category model and hand it to the container
    PBowMd CatBowMd=New(BowDocBs, CatNm, Beta);
    MultiMd->AddBowMd(CatBowMd);
  }
  return MultiBowMd;
}
开发者ID:Austindeadhead,项目名称:qminer,代码行数:21,代码来源:bowmd.cpp

示例11: LoadHtmlTxt

/////////////////////////////////////////////////
// BagOfWords-Files
/// Adds the files found under FPath to BowDocBs as HTML documents.
///
/// The document name is the lower-cased file name with the lower-cased
/// normalised FPath prefix removed. If the name contains '/' (or else
/// '\\'), the part SplitOnCh returns before the separator is used as the
/// document's single category.
///
/// @param BowDocBs     document base receiving the documents (AddHtmlDoc)
/// @param FPath        directory to traverse
/// @param NewDIdV      output; cleared, then receives the id of each added doc
/// @param RecurseDirP  forwarded to the TFFile traversal
/// @param MxDocs       maximum number of files to visit, -1 for no limit
/// @param SaveDocP     forwarded to AddHtmlDoc
/// @param Notify       receives status messages
void TBowFl::LoadHtmlTxt(
 PBowDocBs BowDocBs, const TStr& FPath, TIntV& NewDIdV,
 const bool& RecurseDirP, const int& MxDocs,
 const bool& SaveDocP, const PNotify& Notify) {
  // prepare file-directory traversal
  TStr LcNrFPath=TStr::GetNrFPath(FPath).GetLc();
  Notify->OnStatus("Creating Bow from file-path " + FPath + " ...");
  TFFile FFile(FPath, "", RecurseDirP);
  // traverse files
  TStr FNm; int Docs=0; NewDIdV.Clr();
  while (FFile.Next(FNm)){
    // Test the limit before counting, so that Docs never exceeds MxDocs.
    // Counting first left Docs at MxDocs+1 after the break, and the final
    // status message over-reported by one.
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    Docs++;
    Notify->OnStatus(TStr::Fmt("%d\r", Docs));
    // files that no longer exist are counted but not loaded
    if (TFile::Exists(FNm)) { //B:
        // prepare document-name
        TStr DocNm=FNm.GetLc();
        if (DocNm.IsPrefix(LcNrFPath)){
          DocNm=DocNm.GetSubStr(LcNrFPath.Len(), DocNm.Len()-1);}
        // categories
        TStrV CatNmV; TStr CatNm;
        if (DocNm.IsChIn('/')){
          TStr Str; DocNm.SplitOnCh(CatNm, '/', Str);
        } else if (DocNm.IsChIn('\\')){
          TStr Str; DocNm.SplitOnCh(CatNm, '\\', Str);
        }
        if (!CatNm.Empty()){
          CatNmV.Add(CatNm);}
        // load document-content
        TStr DocStr=TStr::LoadTxt(FNm);
        // add document to bow
        NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP));
    }
  }
  Notify->OnStatus(TStr::Fmt("%d", Docs));
  // report completion and check the document-base
  Notify->OnStatus("Done.");
  BowDocBs->AssertOk();
}
开发者ID:Accio,项目名称:snap,代码行数:40,代码来源:bowfl.cpp

示例12: GetDocCentroid

void TSkyGridEnt::GetDocCentroid(const TSkyGridBs* SkyGridBs,
 const PBowDocBs& BowDocBs, const PBowDocWgtBs& BowDocWgtBs,
 const uint64& MnTm, const int& TopWords, const double& TopWordsWgtSumPrc,
 int& Docs, TStrFltPrV& WordStrWgtPrV) const {
  // get doc-ids
  TIntV DocIdV; GetDocIdV(SkyGridBs, MnTm, 0, DocIdV);
  TIntV BowDIdV(DocIdV.Len(), 0);
  for (int DocN=0; DocN<DocIdV.Len(); DocN++){
    int DocId=DocIdV[DocN];
    TStr BowDocNm=TInt::GetStr(DocId);
    int BowDId=BowDocBs->GetDId(BowDocNm);
    BowDIdV.Add(BowDId);
  }
  // create concept vector
  PBowSim BowSim=TBowSim::New(bstCos); // similarity object
  PBowSpV ConceptSpV=TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, BowDIdV);
  // get docs & word-vector
  Docs=DocIdV.Len();
  ConceptSpV->GetWordStrWgtPrV(BowDocBs, TopWords, TopWordsWgtSumPrc, WordStrWgtPrV);
}
开发者ID:Accio,项目名称:snap,代码行数:20,代码来源:skygrid.cpp

示例13: LoadCsv

PBowDocBs TFtrGenBs::LoadCsv(TStr& FNm, const int& ClassId, 
        const TIntV& IgnoreIdV, const int& TrainLen) {

    // feature generators
	PFtrGenBs FtrGenBs = TFtrGenBs::New();
    // CSV parsing stuff
    PSIn SIn = TFIn::New(FNm); 
    char SsCh = ' '; TStrV FldValV;
    // read the headers and initialise the feature generators
    TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);  
    for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
        const TStr& FldVal = FldValV[FldValN];
        if (FldValN == ClassId) { 
            if (FldVal == "NOM") {
                FtrGenBs->PutClsFtrGen(TFtrGenNominal::New());
            } else if (FldVal == "MULTI-NOM") {
                FtrGenBs->PutClsFtrGen(TFtrGenMultiNom::New());
            } else {
                TExcept::Throw("Wrong class type '" + FldVal + "', should be NOM or MULTI-NOM!");
            }
        } else if (!IgnoreIdV.IsIn(FldValN)) {
            if (FldVal == TFtrGenNumeric::GetType()) {
				FtrGenBs->AddFtrGen(TFtrGenNumeric::New());
            } else if (FldVal == TFtrGenNominal::GetType()) { 
				FtrGenBs->AddFtrGen(TFtrGenNominal::New());
            } else if (FldVal == TFtrGenToken::GetType()) { 
				FtrGenBs->AddFtrGen(TFtrGenToken::New(
                    TSwSet::New(swstNone), TStemmer::New(stmtNone)));
            } else if (FldVal == TFtrGenSparseNumeric::GetType()) { 
				FtrGenBs->AddFtrGen(TFtrGenSparseNumeric::New());
            } else if (FldVal == TFtrGenMultiNom::GetType()) { 
				FtrGenBs->AddFtrGen(TFtrGenMultiNom::New());
            } else {
                TExcept::Throw("Wrong type '" + FldVal + "'!");
            }
        }
    }
    const int Flds = FldValV.Len();
    // read the lines and feed them to the feature generators
    int Recs = 0;
    while (!SIn->Eof()) {
        if (Recs == TrainLen) { break; }
        Recs++; printf("%7d\r", Recs);
        TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
        // make sure line still has the same number of fields as the header
        EAssertR(FldValV.Len() == Flds, 
            TStr::Fmt("Wrong number of fields in line %d! Found %d and expected %d!",
            Recs + 1, FldValV.Len(), Flds));
        // go over lines
        try {
			TStrV FtrValV;
            for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
                const TStr& FldVal = FldValV[FldValN];
                if (FldValN == ClassId) { 
					FtrGenBs->UpdateCls(FldVal);
                } else if (!IgnoreIdV.IsIn(FldValN)) {
                    FtrValV.Add(FldVal);
                }
            }
			FtrGenBs->Update(FtrValV);
        } catch (PExcept Ex) {
            TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", 
                Recs+1, Ex->GetMsgStr().CStr()));
        }
    }
    // read the file again and feed it to the training set
    PBowDocBs BowDocBs = FtrGenBs->MakeBowDocBs();
    // we read and ignore the headers since we parsed them already 
    SIn = TFIn::New(FNm); SsCh = ' ';
    TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);  
    // read the lines and feed them to the training set
    Recs = 0;
    while (!SIn->Eof()){
        Recs++; printf("%7d\r", Recs);
        TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
        // make sure line still has the same number of fields as the header
        EAssertR(FldValV.Len() == Flds, 
            TStr::Fmt("Wrong number of fields in line %s! Found %d and expected %d!",
            Recs + 1, FldValV.Len(), Flds));
        // go over lines and construct the sparse vector
		TStrV FtrValV; TStr ClsFtrVal;
        try {
            for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
                const TStr& FldVal = FldValV[FldValN];
                if (FldValN == ClassId) { 
                    ClsFtrVal = FldVal;
                } else if (!IgnoreIdV.IsIn(FldValN)) {
                    FtrValV.Add(FldVal);
                }
            }
        } catch (PExcept Ex) {
            TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", 
                Recs+1, Ex->GetMsgStr().CStr()));
        }
        // add the feature vector to trainsets
		FtrGenBs->AddBowDoc(BowDocBs, TStr::Fmt("Line-%d", Recs), FtrValV, ClsFtrVal);
    }
	// prepare training and testing doc ids
	TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV); IAssert(AllDIdV.IsSorted());
	TIntV TrainDIdV = AllDIdV; TrainDIdV.Trunc(TrainLen);
//.........这里部分代码省略.........
开发者ID:mkarlovc,项目名称:gcentralization,代码行数:101,代码来源:ftrgen.cpp

示例14: main

int main(int argc, char* argv[]){
  Try;
  // create environment
  Env=TEnv(argc, argv, TNotify::StdNotify);

  // command line parameters
  Env.PrepArgs("Text To Bag-Of-Words");
  TStr InFPath=Env.GetIfArgPrefixStr("-ihtml:", "", "Input-Html-Path");
  TStr InMtxFNm=Env.GetIfArgPrefixStr("-imtx:", "", "Input-Matrix-File");
  TStr InTabFNm=Env.GetIfArgPrefixStr("-itab:", "", "Input-Tab-File");
  TStr InTsactFNm=Env.GetIfArgPrefixStr("-itsc:", "", "Input-Transaction-File");
  TStr InSparseFNm=Env.GetIfArgPrefixStr("-ispr:", "", "Input-Sparse-File");
  TStr InSvmLightFNm=Env.GetIfArgPrefixStr("-isvml:", "", "Input-SvmLight-File");
  TStr InCpdFNm=Env.GetIfArgPrefixStr("-icpd:", "", "Input-CompactDocuments-File");
  TStr InTBsFNm=Env.GetIfArgPrefixStr("-itbs:", "", "Input-TextBase-File");
  TStr InLnDocFNm=Env.GetIfArgPrefixStr("-ilndoc:", "", "Input-LineDocuments-File");
  TStr InNmLnDocFNm=Env.GetIfArgPrefixStr("-inlndoc:", "", "Input-Named-LineDocuments-File");
  TStr InReuters21578FPath=Env.GetIfArgPrefixStr("-ir21578:", "", "Input-Reuters21578-Path");
  TStr InCiaWFBFPath=Env.GetIfArgPrefixStr("-iciawfb:", "", "Input-CIA-World-Fact-Book-Path");
  TStr InDaxFNm=Env.GetIfArgPrefixStr("-idax:", "", "Input-DocumentAtlasXML-File");
  TStr OutBowFNm=Env.GetIfArgPrefixStr("-o:", "Out.Bow", "Bow-Output-File (.Bow)");
  bool OutStatP=Env.GetIfArgPrefixBool("-ostat:", true, "Output-Statistics (*.Txt)");
  int Recs=Env.GetIfArgPrefixInt("-docs:", -1, "Documents-To-Process");
  bool RecurseDirP=Env.GetIfArgPrefixBool("-recurse:", false, "Recurse-Directories");
  TStr SwSetTypeNm=Env.GetIfArgPrefixStr("-stopword:", "en523", "Stop-Word-Set "+TSwSet::GetSwSetTypeNmVStr());
  TStr SwSetFNm=Env.GetIfArgPrefixStr("-istopword:", "", "External-Stop-Word-Set-File");
  TStr StemmerTypeNm=Env.GetIfArgPrefixStr("-stemmer:", "porter", "Stemmer "+TStemmer::GetStemmerTypeNmVStr());
  int MxNGramLen=Env.GetIfArgPrefixInt("-ngramlen:", 3, "Max-NGram-Length");
  int MnNGramFq=Env.GetIfArgPrefixInt("-ngramfq:", 5, "Min-NGram-Frequency");
  bool SaveDocP=Env.GetIfArgPrefixBool("-savedoc:", false, "Save-Document-Text");
  if (Env.IsEndOfRun()){return 0;}
  // -idir:f:\data\ciawfb\print -o:CiaWfb.Bow -docs:50
  // -isvml:SvmLightTrain.Dat -o:SvmLight.Bow
  // -ir21578:f:\data\Reuters21578 -o:Reuters21578.Bow
  // -inlndoc:c:\data\yahoocompanies\CompProfilesSymbols.txt
  // -ihtml:c:\data\cordis\fp6

  // bag-of-words to create
  PBowDocBs BowDocBs;

  // load input data
  if (!InFPath.Empty()){ // directory-files
    // prepare stop-words
    PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
    if (!SwSetFNm.Empty()) { SwSet->LoadFromFile(SwSetFNm); }
    // prepare stemmer
    PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
    // load bow
    BowDocBs=TBowFl::LoadHtmlTxt(InFPath, RecurseDirP, Recs,
     SwSet, Stemmer, MxNGramLen, MnNGramFq, SaveDocP);
  } else
  if (!InMtxFNm.Empty()){ // matrix-file
    PBowSimMtx BowSimMtx=TBowSimMtx::LoadTxt(InMtxFNm);
    BowDocBs=TBowFl::LoadFromSimMtx(BowSimMtx);
  } else
  if (!InTabFNm.Empty()){ // tab-file
    BowDocBs=TBowFl::LoadTabTxt(InTabFNm, Recs);
  } else
  if (!InTsactFNm.Empty()){ // transaction-file
    BowDocBs=TBowFl::LoadTsactTxt(InTsactFNm, Recs);
  } else
  if (!InSparseFNm.Empty()){ // sparse-file
    TStr DocDefFNm=TStr::PutFExt(InSparseFNm, ".tup");
    TStr WordDefFNm=TStr::PutFExt(InSparseFNm, ".var");
    TStr TrainDataFNm=InSparseFNm;
    BowDocBs=TBowFl::LoadSparseTxt(DocDefFNm, WordDefFNm, TrainDataFNm, Recs);
  } else
  if (!InSvmLightFNm.Empty()){ // SvmLight-file
    TStr DocDefFNm=TStr::PutFExt(InSparseFNm, ".tup");
    TStr WordDefFNm=TStr::PutFExt(InSparseFNm, ".var");
    TStr TrainDataFNm=InSvmLightFNm;
    BowDocBs=TBowFl::LoadSvmLightTxt(DocDefFNm, WordDefFNm, TrainDataFNm, "", Recs);
  } else
  if (!InTBsFNm.Empty()){ // Text-Base-file
    //BowDocBs=TBowFl::LoadTBsTxt(InTBsFNm, Recs);
  } else
  if (!InCpdFNm.Empty()){ // Compact-Doc-file
    BowDocBs=TBowFl::LoadCpdTxt(InCpdFNm, Recs,
     SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq);
  } else
  if (!InLnDocFNm.Empty()){ // Line-Documents-file
    BowDocBs=TBowFl::LoadLnDocTxt(InLnDocFNm, false, Recs,
     SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq, SaveDocP);
  } else
  if (!InNmLnDocFNm.Empty()){ // Named-Line-Documents-file
    BowDocBs=TBowFl::LoadLnDocTxt(InNmLnDocFNm, true, Recs,
     SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq, SaveDocP);
  } else
  if (!InReuters21578FPath.Empty()){ // Reuters-21578-file
    BowDocBs=TBowFl::LoadReuters21578Txt(InReuters21578FPath, Recs,
     SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq);
  } else 
  if (!InCiaWFBFPath.Empty()){ // CIA-World-Fact-Book
    BowDocBs=TBowFl::LoadCiaWFBTxt(InCiaWFBFPath, Recs,
     SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq);
  } else 
  if (!InDaxFNm.Empty()) { // DocumentAtlasXml-File
    PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
    PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
    BowDocBs=TVizMapXmlDocBs::LoadBowDocBs(InDaxFNm, 
//.........这里部分代码省略.........
开发者ID:,项目名称:,代码行数:101,代码来源:

示例15: GetOntoGround

PLwOntoGround TLwOntoGround::GetOntoGround(
 const PLwOnto& LwOnto, const PBowDocBs& BowDocBs,
 const TStr& LangNm, const bool& DocCatIsTermIdP,
 const double& CutWordWgtSumPrc){
  printf("Generating Ontology-Classifier...\n");
  // shortcuts
  PLwTermBs TermBs=LwOnto->GetTermBs();
  int Terms=TermBs->GetTerms();
  PLwLinkBs LinkBs=LwOnto->GetLinkBs();
  PLwLinkTypeBs LinkTypeBs=LwOnto->GetLinkTypeBs();
  int LangId=LwOnto->GetLangBs()->GetLangId(LangNm);
  int Docs=BowDocBs->GetDocs();
  // create tfidf
  printf("  Creating BowDocWgtBs ...");
  PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
  PBowSim BowSim=TBowSim::New(bstCos);
  printf(" Done.\n");
  // collect documents per ontology-term
  printf("  Collecting documents per ontology-term ...\n");
  TIntIntVH TermIdToDIdVH; int PosCats=0; int NegCats=0;
  for (int DId=0; DId<Docs; DId++){
    printf("    Docs:%d/%d Pos:%d Neg:%d\r", 1+DId, Docs, PosCats, NegCats);
    for (int DocCIdN=0; DocCIdN<BowDocBs->GetDocCIds(DId); DocCIdN++){
      // get document-category
      int CId=BowDocBs->GetDocCId(DId, DocCIdN);
      TStr CatNm=BowDocBs->GetCatNm(CId);
      // get term-id
      if (DocCatIsTermIdP){
        int TermId=CatNm.GetInt();
        if (TermBs->IsTermId(TermId)){
          TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
        } else {NegCats++;}
      } else {
        if (TermBs->IsTermId(CatNm, LangId)){
          int TermId=TermBs->GetTermId(CatNm, LangId);
          TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
        } else {NegCats++;}
      }
    }
  }
  printf("    Docs:%d/%d Pos:%d Neg:%d\n", Docs, Docs, PosCats, NegCats);
  printf("  Done.\n");
  // create sub-terms & up-terms vectors
  printf("  Creating sub-terms & up-terms vectors ...");
  TIntIntVH Const_TermIdToSubTermIdVH;
  TIntIntVH TermIdToSubTermIdVH;
  TIntIntVH TermIdToUpTermIdVH;
  for (int TermN=0; TermN<Terms; TermN++){
    int TermId=TermBs->GetTermId(TermN);
    for (int LinkN=0; LinkN<LinkBs->GetFromLinks(TermId); LinkN++){
      int LinkTypeId; int DstTermId;
      LinkBs->GetFromLink(TermId, LinkN, LinkTypeId, DstTermId);
      TStr LinkTypeNm=LinkTypeBs->GetLinkType(LinkTypeId)->GetLinkTypeNm();
      if (LinkTypeNm=="NT"){
        Const_TermIdToSubTermIdVH.AddDat(TermId).Add(DstTermId);
        TermIdToSubTermIdVH.AddDat(TermId).Add(DstTermId);
        TermIdToUpTermIdVH.AddDat(DstTermId).Add(TermId);
      }
    }
  }
  printf("   Done.\n");
  // create centroids
  printf("  Creating centroids ...\n");
  THash<TInt, PBowSpV> TermIdToConceptSpVH;
  TIntIntVH TermIdToSubTermDIdVH;
  TIntH ProcTermIdH;
  int PrevActiveTerms=-1;
  forever{
    // count active nodes for processing
    int ActiveTerms=0;
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      if ((TermIdToSubTermIdVH.IsKey(TermId))&&
       (TermIdToSubTermIdVH.GetDat(TermId).Len()>0)){
        ActiveTerms++;
      }
    }
    // stop if no change from previous round
    printf("    Active-Terms:%d\n", ActiveTerms);
    if (ActiveTerms==PrevActiveTerms){break;}
    PrevActiveTerms=ActiveTerms;
    // reduce active-nodes with zero-ancestors
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      if (ProcTermIdH.IsKey(TermId)){continue;}
      if ((!TermIdToSubTermIdVH.IsKey(TermId))||
       (TermIdToSubTermIdVH.GetDat(TermId).Len()==0)){
        printf("    %d/%d\r", 1+TermN, Terms);
        ProcTermIdH.AddKey(TermId);
        // collect document-ids
        TIntV TermDIdV;
        if (TermIdToDIdVH.IsKey(TermId)){
          TermDIdV.AddV(TermIdToDIdVH.GetDat(TermId));}
        if (TermIdToSubTermDIdVH.IsKey(TermId)){
          TermDIdV.AddV(TermIdToSubTermDIdVH.GetDat(TermId));}
        // create concept-vector if any documents
        if (TermDIdV.Len()>0){
          PBowSpV ConceptSpV=
           TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, TermDIdV, CutWordWgtSumPrc);
          TermIdToConceptSpVH.AddDat(TermId, ConceptSpV);
//.........这里部分代码省略.........
开发者ID:Accio,项目名称:snap,代码行数:101,代码来源:ontolight.cpp


注:本文中的PBowDocBs类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。