From 87f61031655c2b85e9e2b384398e800febeaf89a Mon Sep 17 00:00:00 2001 From: Nate Parrott Date: Thu, 15 Jan 2015 19:24:22 -0500 Subject: [PATCH] I've made a huge mistake fix all parser accuracy regressions with this one weird trick! (+ several other experimental changes) --- .../Parser/NSString+PSTokenize.m | 5 +- .../Parser/PSProbabilityCounter.h | 1 + .../Parser/PSProbabilityCounter.m | 6 ++ .../FlashlightKit/Parser/Parsnip.m | 4 +- .../FlashlightTool.xcodeproj/project.pbxproj | 69 +++++++++++++++++-- 5 files changed, 76 insertions(+), 9 deletions(-) diff --git a/FlashlightApp/FlashlightKit/FlashlightKit/Parser/NSString+PSTokenize.m b/FlashlightApp/FlashlightKit/FlashlightKit/Parser/NSString+PSTokenize.m index ad137b0f..d5b93b0d 100644 --- a/FlashlightApp/FlashlightKit/FlashlightKit/Parser/NSString+PSTokenize.m +++ b/FlashlightApp/FlashlightKit/FlashlightKit/Parser/NSString+PSTokenize.m @@ -35,6 +35,7 @@ - (NSArray *)ps_tokenize { NSMutableArray *tokens = [NSMutableArray new]; NSLinguisticTagger *tagger = [[NSLinguisticTagger alloc] initWithTagSchemes:@[NSLinguisticTagSchemeTokenType] options:0]; tagger.string = self; + __block NSString *prevText = nil; [tagger enumerateTagsInRange:NSMakeRange(0, self.length) scheme:NSLinguisticTagSchemeTokenType options:0 usingBlock:^(NSString *tag, NSRange tokenRange, NSRange sentenceRange, BOOL *stop) { NSString *text = [self substringWithRange:tokenRange]; if ([tag isEqualToString:NSLinguisticTagWhitespace]) { @@ -43,7 +44,9 @@ - (NSArray *)ps_tokenize { } else { PSToken *token = [PSToken new]; token.original = text; - token.features = @[token.original]; + NSString *bigram = [NSString stringWithFormat:@"%@-%@", prevText, text]; + prevText = text; + token.features = @[token.original, bigram]; [tokens addObject:token]; } }]; diff --git a/FlashlightApp/FlashlightKit/FlashlightKit/Parser/PSProbabilityCounter.h b/FlashlightApp/FlashlightKit/FlashlightKit/Parser/PSProbabilityCounter.h index dfd32945..6d4442a2 100644 --- a/FlashlightApp/FlashlightKit/FlashlightKit/Parser/PSProbabilityCounter.h +++ b/FlashlightApp/FlashlightKit/FlashlightKit/Parser/PSProbabilityCounter.h @@ -14,5 +14,6 @@ - (void)addItem:(id)item; - (NSEnumerator *)allItems; - (double)smoothedLogProbForItem:(id)item; +- (double)specialTextProbabilityForItem:(id)item; @end diff --git a/FlashlightApp/FlashlightKit/FlashlightKit/Parser/PSProbabilityCounter.m b/FlashlightApp/FlashlightKit/FlashlightKit/Parser/PSProbabilityCounter.m index 69573749..0522f6c6 100644 --- a/FlashlightApp/FlashlightKit/FlashlightKit/Parser/PSProbabilityCounter.m +++ b/FlashlightApp/FlashlightKit/FlashlightKit/Parser/PSProbabilityCounter.m @@ -51,4 +51,10 @@ - (void)ps_mergeWith:(PSProbabilityCounter *)other allowUnmergeableTypes:(BOOL)a } } +- (double)specialTextProbabilityForItem:(id)item { + double count = [self.countsForItems[item] doubleValue]; + double p = 1 - 1.0 / (count + 1); + return PSLogProb(PSMinimalProbability + p * (1 - PSMinimalProbability)); +} + @end diff --git a/FlashlightApp/FlashlightKit/FlashlightKit/Parser/Parsnip.m b/FlashlightApp/FlashlightKit/FlashlightKit/Parser/Parsnip.m index 6420be16..7e144f4a 100644 --- a/FlashlightApp/FlashlightKit/FlashlightKit/Parser/Parsnip.m +++ b/FlashlightApp/FlashlightKit/FlashlightKit/Parser/Parsnip.m @@ -160,12 +160,12 @@ - (void)addNewCandidatesToDictionary:(NSMutableDictionary *)dict withCandidate:( - (double)logProbOfEmissionOfToken:(PSToken *)token fromTerminalNodeNamed:(NSString *)tagName { if ([PSTerminalNode isNameOfFreeTextNode:tagName]) { - return PSSmoothLogProb(PSLogProb(PSMinimalProbability)); + return PSSmoothLogProb(PSLogProb(PSFreeTextProbability)); } else { PSProbabilityCounter *counter = self.emissionProbs[tagName]; double logProb = 0; for (id feature in token.features) { - logProb += [counter smoothedLogProbForItem:feature]; + logProb += [counter specialTextProbabilityForItem:feature]; } return logProb; } diff --git a/FlashlightApp/FlashlightTool/FlashlightTool.xcodeproj/project.pbxproj b/FlashlightApp/FlashlightTool/FlashlightTool.xcodeproj/project.pbxproj index a92e2fb8..025fe333 100644 --- a/FlashlightApp/FlashlightTool/FlashlightTool.xcodeproj/project.pbxproj +++ b/FlashlightApp/FlashlightTool/FlashlightTool.xcodeproj/project.pbxproj @@ -7,7 +7,7 @@ objects = { /* Begin PBXBuildFile section */ - 5F249B291A5CFCA3001FEF8F /* FlashlightKit.framework in CopyFiles */ = {isa = PBXBuildFile; fileRef = 5FC13E5D1A4E13B8008A0FE3 /* FlashlightKit.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; }; + 431E20151A68906E00C2BD39 /* FlashlightKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 431E20101A68905E00C2BD39 /* FlashlightKit.framework */; }; 5F7BBDB71A5E490F00377A9F /* Icon.icns in Resources */ = {isa = PBXBuildFile; fileRef = 5F7BBDB61A5E490F00377A9F /* Icon.icns */; }; 5F7BBDB91A5E503E00377A9F /* bundle.icns in Resources */ = {isa = PBXBuildFile; fileRef = 5F7BBDB81A5E503E00377A9F /* bundle.icns */; }; 5F7BBDBB1A5E515800377A9F /* say.bundle in Resources */ = {isa = PBXBuildFile; fileRef = 5F7BBDBA1A5E515800377A9F /* say.bundle */; }; @@ -16,10 +16,30 @@ 5FC13D4B1A4CD847008A0FE3 /* MainMenu.xib in Resources */ = {isa = PBXBuildFile; fileRef = 5FC13D491A4CD847008A0FE3 /* MainMenu.xib */; }; 5FC13D571A4CD847008A0FE3 /* FlashlightToolTests.m in Sources */ = {isa = PBXBuildFile; fileRef = 5FC13D561A4CD847008A0FE3 /* FlashlightToolTests.m */; }; 5FC13DE31A4E0705008A0FE3 /* WebKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 5FC13DE21A4E0705008A0FE3 /* WebKit.framework */; }; - 5FC13E5E1A4E13B8008A0FE3 /* FlashlightKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 5FC13E5D1A4E13B8008A0FE3 /* FlashlightKit.framework */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ + 431E200F1A68905E00C2BD39 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 431E200A1A68905E00C2BD39 /* FlashlightKit.xcodeproj */; + proxyType = 2; + remoteGlobalIDString = 5FC13DEE1A4E12DB008A0FE3; + remoteInfo = FlashlightKit; + }; + 431E20111A68905E00C2BD39 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 431E200A1A68905E00C2BD39 /* FlashlightKit.xcodeproj */; + proxyType = 2; + remoteGlobalIDString = 5FC13DF91A4E12DB008A0FE3; + remoteInfo = FlashlightKitTests; + }; + 431E20131A68906A00C2BD39 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 431E200A1A68905E00C2BD39 /* FlashlightKit.xcodeproj */; + proxyType = 1; + remoteGlobalIDString = 5FC13DED1A4E12DB008A0FE3; + remoteInfo = FlashlightKit; + }; 5FC13D511A4CD847008A0FE3 /* PBXContainerItemProxy */ = { isa = PBXContainerItemProxy; containerPortal = 5FC13D351A4CD847008A0FE3 /* Project object */; @@ -36,13 +56,13 @@ dstPath = ""; dstSubfolderSpec = 10; files = ( - 5F249B291A5CFCA3001FEF8F /* FlashlightKit.framework in CopyFiles */, ); runOnlyForDeploymentPostprocessing = 0; }; /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ + 431E200A1A68905E00C2BD39 /* FlashlightKit.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; name = FlashlightKit.xcodeproj; path = ../FlashlightKit/FlashlightKit.xcodeproj; sourceTree = ""; }; 5F7BBDB61A5E490F00377A9F /* Icon.icns */ = {isa = PBXFileReference; lastKnownFileType = image.icns; path = Icon.icns; sourceTree = ""; }; 5F7BBDB81A5E503E00377A9F /* bundle.icns */ = {isa = PBXFileReference; lastKnownFileType = image.icns; path = bundle.icns; sourceTree = ""; }; 5F7BBDBA1A5E515800377A9F /* say.bundle */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.plug-in"; path = say.bundle; sourceTree = ""; }; @@ -56,7 +76,6 @@ 5FC13D551A4CD847008A0FE3 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; 5FC13D561A4CD847008A0FE3 /* FlashlightToolTests.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = FlashlightToolTests.m; sourceTree = ""; }; 5FC13DE21A4E0705008A0FE3 /* WebKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = WebKit.framework; path = System/Library/Frameworks/WebKit.framework; sourceTree = SDKROOT; }; - 5FC13E5D1A4E13B8008A0FE3 /* FlashlightKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = FlashlightKit.framework; path = "../../../../../Library/Developer/Xcode/DerivedData/Flashlight-ecbbmgoifutwxzgzjgruzjyizltl/Build/Products/Debug/FlashlightKit.framework"; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -64,7 +83,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 5FC13E5E1A4E13B8008A0FE3 /* FlashlightKit.framework in Frameworks */, + 431E20151A68906E00C2BD39 /* FlashlightKit.framework in Frameworks */, 5FC13DE31A4E0705008A0FE3 /* WebKit.framework in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; @@ -79,10 +98,19 @@ /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ + 431E200B1A68905E00C2BD39 /* Products */ = { + isa = PBXGroup; + children = ( + 431E20101A68905E00C2BD39 /* FlashlightKit.framework */, + 431E20121A68905E00C2BD39 /* FlashlightKitTests.xctest */, + ); + name = Products; + sourceTree = ""; + }; 5FC13D341A4CD847008A0FE3 = { isa = PBXGroup; children = ( - 5FC13E5D1A4E13B8008A0FE3 /* FlashlightKit.framework */, + 431E200A1A68905E00C2BD39 /* FlashlightKit.xcodeproj */, 5FC13DE21A4E0705008A0FE3 /* WebKit.framework */, 5FC13D3F1A4CD847008A0FE3 /* FlashlightTool */, 5FC13D531A4CD847008A0FE3 /* FlashlightToolTests */, @@ -154,6 +182,7 @@ buildRules = ( ); dependencies = ( + 431E20141A68906A00C2BD39 /* PBXTargetDependency */, ); name = FlashlightTool; productName = FlashlightTool; @@ -207,6 +236,12 @@ mainGroup = 5FC13D341A4CD847008A0FE3; productRefGroup = 5FC13D3E1A4CD847008A0FE3 /* Products */; projectDirPath = ""; + projectReferences = ( + { + ProductGroup = 431E200B1A68905E00C2BD39 /* Products */; + ProjectRef = 431E200A1A68905E00C2BD39 /* FlashlightKit.xcodeproj */; + }, + ); projectRoot = ""; targets = ( 5FC13D3C1A4CD847008A0FE3 /* FlashlightTool */, @@ -215,6 +250,23 @@ }; /* End PBXProject section */ +/* Begin PBXReferenceProxy section */ + 431E20101A68905E00C2BD39 /* FlashlightKit.framework */ = { + isa = PBXReferenceProxy; + fileType = wrapper.framework; + path = FlashlightKit.framework; + remoteRef = 431E200F1A68905E00C2BD39 /* PBXContainerItemProxy */; + sourceTree = BUILT_PRODUCTS_DIR; + }; + 431E20121A68905E00C2BD39 /* FlashlightKitTests.xctest */ = { + isa = PBXReferenceProxy; + fileType = wrapper.cfbundle; + path = FlashlightKitTests.xctest; + remoteRef = 431E20111A68905E00C2BD39 /* PBXContainerItemProxy */; + sourceTree = BUILT_PRODUCTS_DIR; + }; +/* End PBXReferenceProxy section */ + /* Begin PBXResourcesBuildPhase section */ 5FC13D3B1A4CD847008A0FE3 /* Resources */ = { isa = PBXResourcesBuildPhase; @@ -257,6 +309,11 @@ /* End PBXSourcesBuildPhase section */ /* Begin PBXTargetDependency section */ + 431E20141A68906A00C2BD39 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + name = FlashlightKit; + targetProxy = 431E20131A68906A00C2BD39 /* PBXContainerItemProxy */; + }; 5FC13D521A4CD847008A0FE3 /* PBXTargetDependency */ = { isa = PBXTargetDependency; target = 5FC13D3C1A4CD847008A0FE3 /* FlashlightTool */;