Domain-Specific Knowledge-Based Information Retrieval Model Using Knowledge Reduction

1901 F20101123_AADBBC yoon_c_Page_034.txt
e240f62d29e7108320872c055ac6e601
a123a930b363ea714bbb91424a4a506485ea8ac6
2202 F20101123_AADBAO yoon_c_Page_014.txt
0741d2e038a465158dc9ccb876e924d5
0e9e5355837c38ff3a238593a27461c6a60be78b
F20101123_AADAVI yoon_c_Page_043.tif
08d27bdcb7046de3e617bcf866e65160
172e9b6734193f0f8cbd8d49af145e00485ff2da
F20101123_AADAUU yoon_c_Page_022.tif
9ecdc289a053fca4fefdd9952df6f55f
d5f2607c5747654563b38e251a99b7aebf55b55f
1938 F20101123_AADBBD yoon_c_Page_035.txt
ffd2e6b5dd03c61388faa0d4be08e4af
cc2abf87c76565a6cb0f26c3fa14185bedc862be
1185 F20101123_AADBAP yoon_c_Page_016.txt
04998c98c0c223756eb065b02702dcdd
7081cf5d507d6077d8815c101bc40cfee84f3556
F20101123_AADAVJ yoon_c_Page_044.tif
5d8fd831f156df664cc642f7031531ae
94784a142182b889577b2ea5707c897a4c19d6ba
F20101123_AADAUV yoon_c_Page_023.tif
8bd62637ab304e8db5cee272a17644e3
627ee4a35e112e4c07cc5e3ade2f66effb23534b
584 F20101123_AADBBE yoon_c_Page_036.txt
d97df512b4d962b13a1c89f1e5df114c
b6e7bc404fcf511fb4215eed4245c81e092c8db3
1497 F20101123_AADBAQ yoon_c_Page_017.txt
ba942599c84a0c55818903ceaf13574d
882db4b2330f2ea2ad80363e4de991b5c1fa7e4c
F20101123_AADAVK yoon_c_Page_046.tif
394cf6dcf9629eb48467812c0e6be30a
9f99546518125f706cdea9e132764a2ae7ff5b7a
F20101123_AADAUW yoon_c_Page_024.tif
ee72a960d29ac6961a6f2ea2de7e7768
3d937419389168fa7183170cfccc9f5cf45c0d38
1617 F20101123_AADBBF yoon_c_Page_037.txt
bd502536681cf85a9c0ffba3df3fc955
5cfe7847022fb5172520545d590648ea7e2cfea5
1720 F20101123_AADBAR yoon_c_Page_018.txt
d9bec67e698f822e21f20639bb12539d
fafac89d06ae3010e8ea6d0b8a1c0c12beb87465
F20101123_AADAVL yoon_c_Page_047.tif
3bb772e5ceaa355b2a54528b68478203
5f4747ba6ee3634f850ed4064dc5374d5f9c9072
F20101123_AADAUX yoon_c_Page_025.tif
7f32de37fc2f6a863e688b84e5b207a7
f9f9f7b5ffd9ddb1ca0eb05fd731e769e65dbce6
1749 F20101123_AADBBG yoon_c_Page_038.txt
008e6219e5b12dd9dc7ce2ff26bb0d97
6958527e6d8ff203ec7e4edf3e182de9d67b0a91
F20101123_AADAWA yoon_c_Page_068.tif
3e793ee23188e48287a5068459622f18
35d6948f960d5d820b880f956ccecf2610910e31
1866 F20101123_AADBAS yoon_c_Page_019.txt
1e2a50f0a508174357121f116bcec01a
88e24d286bb78ed3a3ce6c7cc0d00c304c4c12f6
F20101123_AADAVM yoon_c_Page_049.tif
2688e63d539c2070c60392c50a17bcfa
5442d016702fd06e2dda0f7870ff3f2a9c34a5da
F20101123_AADAUY yoon_c_Page_027.tif
130d631d877643b6738d49d1a45a4c61
3a5686b711132b92554ee70601a90f36c64302d5
1748 F20101123_AADBBH yoon_c_Page_040.txt
9bb4b0908687a7cc7a45132b1a39e103
7d58aba197b4163f052f0cf3e44afa826c87d161
F20101123_AADAWB yoon_c_Page_069.tif
5c9133705f4b827380041d9570d7d4fe
3d962452257ef34dc6fa43459d7ceaad880d4f99
1582 F20101123_AADBAT yoon_c_Page_020.txt
f95d48a9fd4e5efc13657e33f445bd30
b40d64086f87fd6cba4f14f4de27bf3cbe0b2d36
F20101123_AADAVN yoon_c_Page_050.tif
5447332bc19491185e86499b7a5228e5
6f163268415f93f5300249af1f6b83de9436fd75
F20101123_AADAUZ yoon_c_Page_028.tif
edc23d36a28c2c5818b62d1d4de095ea
7acda484d28613d96c168cf68c94c49a5751ad0e
F20101123_AADAWC yoon_c_Page_071.tif
fc2c74b57f27ed73a4a388564c32b661
2054e20202a9cc346f14931e212a8de6184ae6c4
1408 F20101123_AADBAU yoon_c_Page_021.txt
2a1f8dae40ca88787508baeaa4c6e935
839369806f5bb55b49c3a9960c39a6c0ebe675a5
F20101123_AADAVO yoon_c_Page_051.tif
f9e3577c3a8d35935703c7a1d0e3fb7e
727ceab3a4abd0a9ae0dd4eb5f8c54d22e941189
1356 F20101123_AADBBI yoon_c_Page_043.txt
48a8292b3866169b0ef6cce2046fad6e
b80fdaccfbe1f6ccd26c7c6e43ce7ab1bd0a5ca1
F20101123_AADAWD yoon_c_Page_072.tif
dedab6a29dd715439b19732583710b77
e3d143bcae126805079b1408dd8b6b7b8ff4baa7
1981 F20101123_AADBAV yoon_c_Page_025.txt
c2ad4a18d8f9a0e6f1d1e82220400155
308028d8f6f9fbdad1f3f27d44979c7be56ae337
F20101123_AADAVP yoon_c_Page_052.tif
c1cd2d8d30a7d43885ba8da26e1e29bf
5e31fb68424e02fd3814b30ea34a91f5b2739f2d
1859 F20101123_AADBBJ yoon_c_Page_044.txt
ef8eab9eb54dfe4017834820daf75e98
2a75f87e7e25dcc5a86e66c01437eebb7c7ac2a2
F20101123_AADAWE yoon_c_Page_076.tif
d409501e9ef46528f21790298631d532
557c4fc5353d216c0b27f45e0e9895bb6fcd781c
2027 F20101123_AADBAW yoon_c_Page_026.txt
844c8fd7a8249f90b6a619c7e5060b29
9feb0e8e7d6ee06e56e0f3c13769fcc4f4252997
F20101123_AADAVQ yoon_c_Page_053.tif
572a5e42650ecb651dd6f2c930e9ea26
4c0c981eaf912f4789e26e92db16864d89ff3b29
1286 F20101123_AADBBK yoon_c_Page_045.txt
a28dedf07fb625c7d47806bf10991536
95c94e31dd288c2d8f7adf9f275a44a01898c706
F20101123_AADAWF yoon_c_Page_078.tif
5a9616c3942d0902055b43fd8e4af3d8
881d77c82fdb689a137341668d7cb02d3ae55a31
F20101123_AADBAX yoon_c_Page_027.txt
410ecfa098ac21f011e1a26dcd63e560
93a0e886e9f1fa7643b24151f4089e61ecb23b78
F20101123_AADAVR yoon_c_Page_054.tif
fb2b69fa6e1166c1c0ed06e4ff9c5384
ab0ebfec6795d84bdd79a1ef62bc9280d490db20
1798 F20101123_AADBCA yoon_c_Page_065.txt
c40c220787874ded2626ad54d560aff0
ae1fd392bf547da65be90e64ef47f39c6e7d9946
1775 F20101123_AADBBL yoon_c_Page_046.txt
af85f448f119e87dd5d80a46da78ff2f
c943cfc7a14335578aea347ac6d7d657c4d5398c
F20101123_AADAWG yoon_c_Page_079.tif
19311ca4a6f6e4a3dd0f10d008145876
512399853b2c2186b4c9bd313e5c97720f9816c2
1801 F20101123_AADBAY yoon_c_Page_028.txt
a39e055e78c78b741b6502717a40e88d
392a36b5e20bd40b3ab539efb242517547e7b033
F20101123_AADAVS yoon_c_Page_055.tif
bafca52acf55f9a5b8d64a9cb2cbf33c
7bac00e7ff5d9b7f3f2fc22731d038bcd5cfae87
1984 F20101123_AADBCB yoon_c_Page_066.txt
8eaa6872deeab8f0b1b0d5a6d0fed4fd
8327b27813c220d1b0d235f8d6228fdf7b9994d1
1334 F20101123_AADBBM yoon_c_Page_047.txt
982657e131fa9de8cadf9744152768d6
8f662d5eb4132617357f3ecd6a8f99f3f1968de9
8423998 F20101123_AADAWH yoon_c_Page_080.tif
d52a4985b733e3db8027f77a5331b8dd
0f8ae8f8acf4f35f9d115c9a9e577554b31d9139
1756 F20101123_AADBAZ yoon_c_Page_030.txt
9ce4e1b89b05acb223543289f1f29c0f
1c9ef3cff0f942a0870667978c5dbbe607a70025
F20101123_AADAVT yoon_c_Page_057.tif
8cc932a10d6199b7508cbb7f7a58a463
33c101a16a43aeb66c3adc76fd7d8be55b5af8f3
2335 F20101123_AADBCC yoon_c_Page_068.txt
27a2635d55def37eef3b4afdacdf8924
67cf331b31a1918b44ca177ad1a3582e1ddf1519
1831 F20101123_AADBBN yoon_c_Page_048.txt
e1a35c1cad3735304dd482b9567732fc
86ec88926367c349b8020d9d81db919584fe09ed
F20101123_AADAWI yoon_c_Page_081.tif
4e24f5f114b87f4c0b88ff0fef979dbc
702409dfdef6574e854ee0348d6ffaa684eeb986
F20101123_AADAVU yoon_c_Page_058.tif
b1682d4239b3df0f88d0f42d93d5cff1
9cadfcee54782c648b6f82b6e76bd1b5d03d0a90
1595 F20101123_AADBCD yoon_c_Page_069.txt
f7d3c9194243872f74a5f32c88664142
62f51a6cba0e7d323a69f42a93be319483648748
1843 F20101123_AADBBO yoon_c_Page_049.txt
0d7b7ac07a0ccb10ee1866750bc3f906
519d71327790b76bcd7ac63e64e1a93b5b21aa29
F20101123_AADAWJ yoon_c_Page_083.tif
7dd2a99629e690471c25e978b68f38ee
b7608aa2f71429e9231210347029214a45fe07f0
F20101123_AADAVV yoon_c_Page_060.tif
3f235ce4b6efea4ebe097ea5c6367ff6
fc2421ee056f27362c21f60096007224a739c3d3
1789 F20101123_AADBCE yoon_c_Page_070.txt
4f839b442cc1a7ca326444bc7c7a96b9
b512b053abf78820f6daa94cce07cb14e07f3a99
1922 F20101123_AADBBP yoon_c_Page_050.txt
66bc573382f1142205afe65d3e571f0b
8f24708afaea13ceeee97088225e87ebff185462
F20101123_AADAWK yoon_c_Page_084.tif
611c70fe3143d39977658f553d9fb5f5
72ebae751c20e1e6fba36237f15999f53a1ac8f5
F20101123_AADAVW yoon_c_Page_063.tif
744f4ed856aced3ed60044e29b22bd6b
a5bd6ad19bed7edef9ab837833739583e1831f6a
1829 F20101123_AADBCF yoon_c_Page_071.txt
da8f00326389471d9bd66a1bc2655581
c0d05e4c2f419025d4b5cf171470462c320fce75
1777 F20101123_AADBBQ yoon_c_Page_051.txt
d2509128c31f7d934efd2197f4ea6e81
1131a7c7bd20306d0fdbfa6cd0d6703ac290b9cb
F20101123_AADAWL yoon_c_Page_085.tif
6a18a274cf83a5d9ef0daa1977ba5896
c03aec989f5c477efda3f9722db80d4849ee15e4
F20101123_AADAVX yoon_c_Page_064.tif
5dba22496fc9ccaefe5c3e42e066c1ec
c771c633f358e9fbc809e752fd7f1fd077ae8805
377 F20101123_AADBCG yoon_c_Page_072.txt
8faa6cdcce3e4038302da0832981936c
247ff3556533ebf1cfc7a6cc001f03138ed9afce
F20101123_AADBBR yoon_c_Page_052.txt
3022237b727d709d18b3fb54212dcc1d
8d035190dfe0cfcb24c76db52d57ee778377f412
F20101123_AADAWM yoon_c_Page_086.tif
fcf4ebf1229631e8e02f7c3a015a491d
fc00a8a3e3bd69f5eeda0bb47ca53b9fa9d4927f
F20101123_AADAVY yoon_c_Page_065.tif
255cd5b1116af13e6ce3753712407e1a
bccea2f66a185ab87b4d362f92bef3c3340a75f6
1685 F20101123_AADBCH yoon_c_Page_073.txt
e8dcc829535d054bfec0a377f01ca10d
d3670870bea841e002bd7d8fe6fba72660740e07
F20101123_AADAXA yoon_c_Page_108.tif
a589db0f98fc3c9a4488247e7796b119
e564b5dcb55941483ce1eb8a63d797634c70505c
1573 F20101123_AADBBS yoon_c_Page_054.txt
bedd8141f4906af842a5baaf236ff850
3d9f7edf3d32352377941abd33f56bb2155c5bac
F20101123_AADAWN yoon_c_Page_087.tif
2a9de10a29b29eda2dba06ad2bb84e27
60a8270dadeba8f424a327244453db3e462ef5a9
F20101123_AADAVZ yoon_c_Page_067.tif
35db382f338bf112bade20101458d3f4
689f12966a2950cf7b0559b4dea3e2540ef42d6b
F20101123_AADBCI yoon_c_Page_074.txt
a68dd71a9317ebdc120b90aaf520a66d
44a85f1566f0d197f8887aabcc77d17fd803ddd0
F20101123_AADAXB yoon_c_Page_109.tif
217e4c3da192e846cc88a3e970376f83
b854769e31409d5248f5ea7f63daabcefeb9a390
1858 F20101123_AADBBT yoon_c_Page_055.txt
84620b4089eecc7aaeb5010db62921d8
b2b67ed5bd20e2a20fd419a20f6bf5d54aca6c61
F20101123_AADAWO yoon_c_Page_088.tif
1844353bbb0f9be777363fa3107c7212
a51000a41e416111941298bd1624e2e47e778c82
F20101123_AADAXC yoon_c_Page_110.tif
82f4b8c91aad152a3801d9a976028dd4
8261c7b3693b09eb775a758c072e3fbbefa17bee
1893 F20101123_AADBBU yoon_c_Page_058.txt
6e1d69ff5961f57d28c1b388fa57bcef
3c1106a1c25317cd824cdbdc92495cc6c39771e6
F20101123_AADAWP yoon_c_Page_090.tif
ee58759561de1a5e46daedf1b37ef1ac
02a95f5451f1b8f36924c248ee54624de86036b5
1206 F20101123_AADBCJ yoon_c_Page_075.txt
c97cecff263ffeef89ed3e1361418127
56db02f70b0c9b94ad766bb05f89fe1e999fb044
1120 F20101123_AADAXD yoon_c_Page_002.pro
9189cafffd4e235e792ab3a9f5d0bae5
9f3c4f96e7ca204b05ec69693a83d64d7855384a
1923 F20101123_AADBBV yoon_c_Page_059.txt
2b107c87bb85befe2825697355168d39
b088ae89a71ce2f8e8573a93fd890bd62f253c15
F20101123_AADAWQ yoon_c_Page_093.tif
44dbb37803d8a842e33509e4f2fd7f95
d40038c066e5b2d56996c378bd1b3f21a95efe50
1680 F20101123_AADBCK yoon_c_Page_076.txt
95d42764c6043044be4dc21c699ec902
8bf2096d169f4578a91d43eb6d677bafc75fc4b6
2587 F20101123_AADAXE yoon_c_Page_003.pro
71c49cf3358001cea9a6fd8d1de2e2fd
272f9df8155d9b439cc910d17023200814d49f6c
941 F20101123_AADBBW yoon_c_Page_060.txt
1a54f04e4133a64be0dccdf8c5b9e936
d3ecb3a4f5756397f7d43f21247c66bf96f6c69e
F20101123_AADAWR yoon_c_Page_095.tif
2cd6f10fc7a5321454bc1449d98bef98
1a41dbdeca6e92a4fcb3abaf513602c9a994cc8e
1887 F20101123_AADBDA yoon_c_Page_097.txt
947d07add24208f477e2f21905c05478
9343f41d8abca7efb5cbdb2c1bdec95742de9a58
F20101123_AADBCL yoon_c_Page_078.txt
4254f29868fccd85a9a9b45c19147d55
42df2b2c312e2572e85574dcd7f62576063b0917
19441 F20101123_AADAXF yoon_c_Page_004.pro
b81d0b4ffac49772fa8b842d2e85ab79
8852d40c86edf836bb152b456b2a727119d4a0e1
1469 F20101123_AADBBX yoon_c_Page_061.txt
e3ab1a5886544aa95cebec3d85b95186
80912cea07fef70e642fc415315deab358697417
2047 F20101123_AADBDB yoon_c_Page_098.txt
2d73d5bc7cd68d9ffa0509516ae83c76
5f29117aff006afe36c74b36e23c8fda284618df
1578 F20101123_AADBCM yoon_c_Page_080.txt
b69510c0be657433839905abd1cbf746
3b0528b520f230b7566657505c8041ade687c6d7
72819 F20101123_AADAXG yoon_c_Page_005.pro
ebd63ab37d167d259949dc0856e52e45
57f26ace887ad3575f1bcb175994c70c33b48719
1598 F20101123_AADBBY yoon_c_Page_062.txt
1d36707e01e14681be79288b707f90a8
ad086f45423e0df921d59ce1c008e08f98e38cc2
F20101123_AADAWS yoon_c_Page_096.tif
65a532cc67826e6a4b4fc167ecaf2e93
3ea64c073beddafcdb7aef36768b0fe4d3cc5732
1174 F20101123_AADBDC yoon_c_Page_099.txt
c5e7e7299fd3b7986aa5e5f702625d87
f9c52ef03a5c7ebc2601a68d7516b906ddc971d0
1855 F20101123_AADBCN yoon_c_Page_081.txt
7577b75af51e2c719880457afa3e092d
9a9cda5eccb04b9388cbd28ff3ffdacfe37fa861
98585 F20101123_AADAXH yoon_c_Page_006.pro
7d6eb0035ab1f8ab12ea14bd9f5ca855
541806da96addd158d19b383ae4538372a2200aa
1849 F20101123_AADBBZ yoon_c_Page_064.txt
44218284d2b895997e4362ca81a6287c
9ad018384cfd9f267a12357178de8985fa079a3c
F20101123_AADAWT yoon_c_Page_097.tif
d8d54783e81143cc6537086e8344b319
43253bb7c3624e2eacad6d82f79164e3eb826fa1
1844 F20101123_AADBDD yoon_c_Page_100.txt
c25eba0a18ba6678dc1d85be9ce1d0a4
86fbb32394e20df660d2d45895dc77504b8e69a5
1616 F20101123_AADBCO yoon_c_Page_083.txt
63fa438808b047286b01bd4fa9563057
1385fcf069bb41eba5e4b42cc2fbf2f933360b58
56829 F20101123_AADAXI yoon_c_Page_009.pro
265215d1a3599c9cd7e7096a8aad66ac
bebeed5e1fd6c062fc2d5ce1d13353717f1a9651
F20101123_AADAWU yoon_c_Page_101.tif
e496adab52c88b455b10a362316cd34b
3a64e7cf9ec67a89545391aace19a7a2c96f0a2d
2057 F20101123_AADBDE yoon_c_Page_101.txt
48c35d0534a34b7257d17eaa0c342671
a1ee87452c4ae8ecac19c87104ec52276be121f5
1114 F20101123_AADBCP yoon_c_Page_084.txt
036c40758451ce55d18ea7d1c10a3791
ce9433188123ff6e6324aee0ecb26c458b73d8ae
54636 F20101123_AADAXJ yoon_c_Page_014.pro
0691326e51ab9669ebd2e4d62dc1b8fb
a2153e75cb6b19e47ab13a2c9f11a9bdb275e0fe
F20101123_AADAWV yoon_c_Page_102.tif
76886cba179c0f12334806b8713f6d70
781afcd744ad35930a1b6e9a1cce28b9a92494da
1626 F20101123_AADBDF yoon_c_Page_102.txt
867c23dcd0e8b6735a2c056e51427207
24563de1f5523c0f941e7546b111897a035634e1
1479 F20101123_AADBCQ yoon_c_Page_085.txt
90543071b7ea800602c6663b66a9f536
194eb2273ae09b846b848bee586ea660db731619
43508 F20101123_AADAXK yoon_c_Page_015.pro
ed8593bd02a2d18210d6ce3cdd2653a9
a4320f351a04b1e00a7e6575dc83138cb46dfae8
F20101123_AADAWW yoon_c_Page_103.tif
6fb02aee0989e6bf9ebe9516bbc6a92d
3813462db2ad7503384106908c004b27a9ede90f
1666 F20101123_AADBDG yoon_c_Page_103.txt
df90d93a7f4bcdffcea20622f4db95be
e265e985ed8397a281094917940ac827d49c26d2
1771 F20101123_AADBCR yoon_c_Page_086.txt
c3a39a29fd94201cd4640ec2a690afb1
014a51e05c388dfd999473e1c75f3ded49a9a9ac
25593 F20101123_AADAXL yoon_c_Page_016.pro
63963d2dc3ced02170040fc0dd9789b0
506d683ac4514426162dd4adf8facc1a4c3c8738
F20101123_AADAWX yoon_c_Page_104.tif
2c0e830713df2bf5905a322da6a0dfea
64f0ff32e6a4c4e39b633bb1751f31b43cfb6d93
1646 F20101123_AADBDH yoon_c_Page_104.txt
691b3b810290a601c298d72da76a21fc
c8e74273b98d141aff2e93c36e4ad304f4858017
38518 F20101123_AADAYA yoon_c_Page_037.pro
480e937e26dc5e1110b10645768431af
50150126c16732d333895a34d35ed1711ddf5901
691 F20101123_AADBCS yoon_c_Page_087.txt
f0db509fba808b1e75521ccf5aef9851
1536f20fb701a64730eebec4d506c26990a2e363
36494 F20101123_AADAXM yoon_c_Page_017.pro
9e9a3a2d3cbcc9bda8190d81a46e5360
2cca5e9e16008d17b98c84649f8d7c4c4fd94470
F20101123_AADAWY yoon_c_Page_105.tif
31798288c9e10b9211b92c1c01770b65
375f0c6176eee8cb9c6f6738942acc8904277b01
1991 F20101123_AADBDI yoon_c_Page_105.txt
43eb95b1844ba7a2bb63fb02aaef407d
5c16d4f1e49ea934c44f31d063409a17892a9e49
43435 F20101123_AADAYB yoon_c_Page_038.pro
59fffb8f53061e7d8650788bd014336f
ce42a661e71eee540be84cd7b6a9f822714946ae
794 F20101123_AADBCT yoon_c_Page_088.txt
447e8a9b8cd1716c9d49799476787eda
e1f0bc8959d865291d23a555777e68bb81b0f855
41187 F20101123_AADAXN yoon_c_Page_018.pro
37fc284397189e98288d3cc348423f39
bc20e96d1c1ba1a27f60bf933c27ae5d696618bf
F20101123_AADAWZ yoon_c_Page_107.tif
54b678f7b6be3d0718df7e5efb5cba9c
5f6e755013a459dde7959d9a0ac957248fa8f206
2492 F20101123_AADBDJ yoon_c_Page_107.txt
127299dbb7fbbc723b18f9006ef0f9c0
8edf9b5ca23db5395a716370cc3505e41eaa5286
47296 F20101123_AADAYC yoon_c_Page_039.pro
6bb862045b8b00cf147d5fe9466c3dde
9c22856222d79368c38177e35c107ae32e14db9d
1750 F20101123_AADBCU yoon_c_Page_089.txt
653c00ca0157c687bad417d057484fff
a83bb3815e276964c3f869dbd3ac6262ba0975fa
36656 F20101123_AADAXO yoon_c_Page_020.pro
235ad5691c48fb679baf1924c860f53e
0f1ef6d30796b00e02bb028235cd5b52b440e143
39553 F20101123_AADAYD yoon_c_Page_040.pro
3b230ae1a863a72e360d4a9aceabc389
c3d642a6e2a07895d3a75e2019449102b2027890
1860 F20101123_AADBCV yoon_c_Page_090.txt
ab2f9f9e471dbf4febac37a89149ec13
54a3574b657917827fff2ca7a99b243b17c7cdc0
29313 F20101123_AADAXP yoon_c_Page_021.pro
d95fdb9bc9742e94f792b7c1ef0c506a
bd8cf1a48b63165521ebba29d3ee8a8159c9bcac
2403 F20101123_AADBDK yoon_c_Page_108.txt
b9c9cae6bafc2fcff1253519395b82ad
a12d060b9f9061bcc25cc13fc9e38fe895f86951
31917 F20101123_AADAYE yoon_c_Page_041.pro
4c37c3a801849916f14d4721b301508c
46eb4030838a10a170d376ed6847202fc1586c6b
675 F20101123_AADBCW yoon_c_Page_091.txt
4ff05306dc0941c35e837e1f3799d146
c63b2227c62b374f7a098b5848d30de66a72fdde
40342 F20101123_AADAXQ yoon_c_Page_022.pro
a0e40816b110d6e096f2f4d07e4fa8f2
182e6b7a85b9050da65aadaa61dab05277cb1315
2475 F20101123_AADBDL yoon_c_Page_109.txt
4ae4152225c5c34f8d4d82738473ebbc
6f9beb269d2275392319dc703bf49e76212ff10c
34832 F20101123_AADAYF yoon_c_Page_042.pro
b1e33597533b98321b9595db04613791
a18de1af9293a496df1dcfe3b9bd79ac1d889b48
1092 F20101123_AADBCX yoon_c_Page_092.txt
6acb01ecb42ea373464bbc2e29694681
66366e2ac1e63f16423d994ba940ae76f346e154
50509 F20101123_AADAXR yoon_c_Page_026.pro
ab822303dcbbe6970067826be96ee3f5
3aeb016070ab0436fad6d074ad267baaa8b0dfe0
5225 F20101123_AADBEA yoon_c_Page_009thm.jpg
ecab888f9fb0656ab153e1750137aa57
0f2b59f26181b46574a6443df844926eb80ecee1
2378 F20101123_AADBDM yoon_c_Page_001thm.jpg
e29c6ec127ce11928194990462d097cb
8b435a1a30d2bbadaaa7b55391429cfd8f484193
46830 F20101123_AADAYG yoon_c_Page_044.pro
87d11f93eab3c466a30c802526adda31
5582df142c7bf8fe1f7b422c5a8126cedb48d769
943 F20101123_AADBCY yoon_c_Page_093.txt
fe04c0e5f19a8d13835084752fe4025b
a1b94994639861e9cc24788d619592eb5be77fa9
50901 F20101123_AADAXS yoon_c_Page_027.pro
ca8964425eea2be28a60e054861045ef
45c6339b6ff6bbda91ef62601349d552565ce4d7
17876 F20101123_AADBEB yoon_c_Page_010.QC.jpg
858dc89fe3a0872d709d33984717b2fa
5a802028e0cbd7ec360636d28c0ef9922ab83f1c
1808167 F20101123_AADBDN yoon_c.pdf
41e109df95b34216cbfbd9734471590e
da0628ee4f03a76757e5efbe7ac06d6ea600c845
44181 F20101123_AADAYH yoon_c_Page_046.pro
a3235535e35b16dd465df8de1b4bb328
6a725dff811be94ec21b110f810dc86ae2b1322d
1755 F20101123_AADBCZ yoon_c_Page_094.txt
4f3b7fdaa0078623d1ee6d768ff65c98
6e39fe6f3d93b2aa17260ebf0479f8ba84235ae6
40691 F20101123_AADAXT yoon_c_Page_028.pro
520782c35fa03d8e89f2c73bd73dd769
8505fbe8101254f3e5d3c18042bd0d5c70c33b97
18469 F20101123_AADBEC yoon_c_Page_011.QC.jpg
edd8f389c334683ed7485dfec7e56368
69b45bf5dc8df6db026d2d1cb3da02e39b0d45d8
7523 F20101123_AADBDO yoon_c_Page_001.QC.jpg
6fe6b029258ca64211f76d187635dc59
c4a25e71502eae3895292ac498c97b6de663b5df
27911 F20101123_AADAYI yoon_c_Page_047.pro
cfae3989a2cc45ba2eba7189bc8eb42f
7d5208c097762f661abb60cba9499375680173c9
32435 F20101123_AADAXU yoon_c_Page_029.pro
b31fcfec5d0277e4a0c22d6a2fcd2b8b
b598c99dab915a7b3d8ff83b173adf176b176a4a
1986 F20101123_AADBED yoon_c_Page_012thm.jpg
497cb09802a4fae3ac044da1b3042851
16a280bcbb6c3172c4ebceeb1cbe14aee9cf16ca
41402 F20101123_AADAYJ yoon_c_Page_049.pro
22035003d3471b9a4f37b6b3228ec499
5f28d4aaa44ad6ddc83d3ffcc9a13ca583d5c36c
43663 F20101123_AADAXV yoon_c_Page_030.pro
be1d66434187a3e8e38ce8604122c7a9
a7a312c7142f12a111abf671512d4f6ae616f0f2
18793 F20101123_AADBEE yoon_c_Page_013.QC.jpg
f2fe1831fddd2f15b6c4e35259e4aa22
17ac854df48de2f0e2826d4a8b284e74cc6bab5b
1396 F20101123_AADBDP yoon_c_Page_002thm.jpg
7be95faab18db1acc7e74eef501b6a32
5ad27ff6d8c298b7a428010688d69de97f401299
46752 F20101123_AADAYK yoon_c_Page_050.pro
b294899dca4c9e068a02b597acfbef90
6290155eb9ca72a23f6bc167dfb606239dc840dd
47369 F20101123_AADAXW yoon_c_Page_032.pro
1bba92aa45acbb5eb9087959cc33126b
38c37c2f276098c0c368a40859ca79c9971230ea
5327 F20101123_AADBEF yoon_c_Page_013thm.jpg
2fc6eb65613b9f007fb41f4254282998
ac17c792487ebc79a910745ed4943821c1b010f0
3632 F20101123_AADBDQ yoon_c_Page_003.QC.jpg
fd5cb5ed50c114d36ab9cae5657bc94d
1f277aa6625a270182feb128b14b52e5bee85119
44624 F20101123_AADAYL yoon_c_Page_051.pro
7f6a02605267076532a29baf380c6302
54b818dc78db3e0bc0db82823ec6da2a35330955
47951 F20101123_AADAXX yoon_c_Page_033.pro
3e409d743fc69a8b4905db98bc2ddda2
1efc83c70eed608a1d57a33124e5d8a3154e0950
6671 F20101123_AADBEG yoon_c_Page_014thm.jpg
09559b2266146ef696ba304b03ba385e
be33d6bc1bfb1b83675e87263b8d27d0efb8e19f
43793 F20101123_AADAZA yoon_c_Page_071.pro
194f7bd6eeaf3e85726bb5efe10e79de
118e91b9b48d912a4bb4bb40279c8c479e67242e
1545 F20101123_AADBDR yoon_c_Page_003thm.jpg
4df4285af7c44f3715ecbe24b40cc9db
c54be38bf339511ea8b823e0a11f506ac04aa3d1
43104 F20101123_AADAYM yoon_c_Page_052.pro
3918cfd2e41f80e5ffe86e9142e6f093
81960623ac67701a232739036a4e55cb8d9d45b0
48077 F20101123_AADAXY yoon_c_Page_034.pro
3cd453828d47db8778444c0750cdbce2
b506dbecbd0ee487ea4239ac398200ff0179d2b1
5992 F20101123_AADBEH yoon_c_Page_015thm.jpg
dfa4ad65b3dbc20b28eaf58e91c2d687
f0ed66c3dac127c9a386ff674a3b42f04d933bd6
11623 F20101123_AADBDS yoon_c_Page_004.QC.jpg
483bd43128ec72412c3f35dde768f4b5
7aa6d43a3483552aea1e5b01de2e72c58a8b79b5
8698 F20101123_AADAYN yoon_c_Page_053.pro
26e01863e92834980982dfbbbaa96c29
74de078fc63a6061b79ca7a601778d93d218cbfc
13538 F20101123_AADAXZ yoon_c_Page_036.pro
3616f02795848177309a276f303ef784
044c04c943b8c98de00a15e31ddf25672dd0d22a
16384 F20101123_AADBEI yoon_c_Page_017.QC.jpg
016b0159a8c22aae8a3b8482a3afbf9c
f636ce8a67d402c96280621922c6f7554249b5ad
36741 F20101123_AADAZB yoon_c_Page_073.pro
d73c3fd861aaccb7e39ce8aa5d1c1a9e
42d7367f4d813e2b17bade4748bf83c6db99f987
18290 F20101123_AADBDT yoon_c_Page_005.QC.jpg
25c75e7f7adebbc5f0c8788f649143e4
3e194d15178c65946562be22044c6bd1658c12e9
28339 F20101123_AADAYO yoon_c_Page_054.pro
aac7ebacfd70d70efe01dc278bfd6df3
6df785e458a37ee7f336ec12f9f1a19e093b7453
4573 F20101123_AADBEJ yoon_c_Page_017thm.jpg
fafbe052f2670d23dc8cc633963c6021
a18f27d8281881f0cfb1e64380309a8474b26b6f
38829 F20101123_AADAZC yoon_c_Page_074.pro
368d64ea43aa3849b7d98c269c599b1e
ee0d7a1ca0b8188133b6e44a97ae9c5ef4cef8b7
4688 F20101123_AADBDU yoon_c_Page_005thm.jpg
6af9d066e54bf1df335b6869d6872113
329b3d32f17efd89f6383aeaa434535c63b0c6dc
37106 F20101123_AADAYP yoon_c_Page_056.pro
346dcdfbf2151596776f624e484e0c06
c080757ad1dc81025fb4f64cdbd06b757e97f841
5500 F20101123_AADBEK yoon_c_Page_018thm.jpg
9b70a1f410de51c4870da3e0e6aa7fb2
4f5ed211fd5b2a5cc8bcbb742d5ebc845e1a07d6
26687 F20101123_AADAZD yoon_c_Page_075.pro
395784c8d2257b84b550c8c3ca26b92e
f3c65ca03f8402c262e4f04799ce24480ed3ac6f
27015 F20101123_AADBDV yoon_c_Page_006.QC.jpg
43b13fdc4f49dd232fb93b35773884f0
a784bbf7695a5cebc115e9f8a9363763120e03d8
47605 F20101123_AADAYQ yoon_c_Page_058.pro
a97828b57b2020bd5d0faad7e8f8af19
647337e23ec8aa2491e7145504073448ed8edc3b
33000 F20101123_AADAZE yoon_c_Page_076.pro
ebe70d35d81e47076f4d179a94acf764
00aba94fd92c089f68b96448f53f03f154d37c83
3315 F20101123_AADBDW yoon_c_Page_007thm.jpg
319c99d93ecc60de92dd4e5d5118df7e
2b1f859d1af80f1ad3f4ff7403a93af129e8c2ad
34207 F20101123_AADAYR yoon_c_Page_061.pro
59b4c030225228d8fe16208185002b7f
49afbc1b694389aeb8b86e2f4de6597679deeed0
F20101123_AADBFA yoon_c_Page_028.QC.jpg
867dfee960281903e01c163840592667
d67821c478a9133a84af20cc856777a9842ecc52
22424 F20101123_AADBEL yoon_c_Page_019.QC.jpg
d081d1e93c045b6898a938d8c8f8ce14
d98ccd55cd7122459b4d7e40432df8be578543e5
28720 F20101123_AADAZF yoon_c_Page_079.pro
f4aee26702160fa6c1829dbe8876c7f8
bf46a67ed7d74c96a04228921b97865b6f188692
13840 F20101123_AADBDX yoon_c_Page_008.QC.jpg
8f69bdc1424e4eff2d070e984f2221ac
0e9f54e75f36caa5b71ff9783958406a329973cc
35597 F20101123_AADAYS yoon_c_Page_062.pro
d75fe1d941d7f717d2e07a7571ec6d09
f6cbf793734a1f11cabbd16d7d72486af3be2925
5914 F20101123_AADBFB yoon_c_Page_028thm.jpg
89ed93cfa7aee906f7859c8715301831
fdd50ad6dfeeb826d88c5ff4f07780a05f63f86e
6320 F20101123_AADBEM yoon_c_Page_019thm.jpg
99ea3face0623c9c3fe242853f05932e
db7794134f17e0b7144539d58f4d77b53e9be2c3
28767 F20101123_AADAZG yoon_c_Page_080.pro
9f5f83a7f2734284e847478066bc2a83
d501b9f08776b244ff552ec49fe92cdcab58d7d7
4014 F20101123_AADBDY yoon_c_Page_008thm.jpg
7bc3dcfd5d769e473a56d349e324e140
45c774872da83b4ee5c7a06bd69dbad104de20c4
17880 F20101123_AADAYT yoon_c_Page_063.pro
d6c2fd4896e57c86b651447c5abda6ea
25d1d37d77a6571dc3f9f6fb1544f3d1337e0373
16691 F20101123_AADBFC yoon_c_Page_029.QC.jpg
ba73706b1cb75dbcd9770602b1e75faa
8d129e47358f33de286450a266857867c48dea55
5490 F20101123_AADBEN yoon_c_Page_020thm.jpg
c7407433186472fc14d20db6977fe82d
a2537a79e55e628ccabd75fe742d760274fe9762
39451 F20101123_AADAZH yoon_c_Page_081.pro
ecf34044ff665a71e21a651f06fe7710
6bd38b98451b28fb371e348d1dcf1120d336c4ba
18965 F20101123_AADBDZ yoon_c_Page_009.QC.jpg
e3874d6999889bcbf3de2e96f22e9fda
e7534135ba78cb3960758220ba3d4a26c0c4a16c
46041 F20101123_AADAYU yoon_c_Page_064.pro
a660221ea402c6d953566ad8215836fe
c893ef0312298b61a4bc632ea0b9ca3c0c53b4cf
20853 F20101123_AADBFD yoon_c_Page_030.QC.jpg
08ac205290a3a22ae3a252dcf6c33052
c65de1f50ab94e16ee448d502aca6a9ca7b5090a
15883 F20101123_AADBEO yoon_c_Page_021.QC.jpg
381af89cb045dc662acce3513be6faa4
93df798b33b662ce4645c45b6c560789a1aad13b
32378 F20101123_AADAZI yoon_c_Page_085.pro
a3e3a3fb0b488755dd2a28ab98f1dddf
015ccfa83e659a7915197c3568f410d8137613bc
36305 F20101123_AADAYV yoon_c_Page_065.pro
d5243efa750701aa15e17b190c6b6915
4e755a87f3a9abd7befbcd2e9b964857dd96d9e7
5826 F20101123_AADBFE yoon_c_Page_030thm.jpg
5bdf79975372f31e3e53d30e9b49b432
dc661710638c57a176d6c9a15e80faefa89fd871
5102 F20101123_AADBEP yoon_c_Page_021thm.jpg
1a1bad57119f90751ab0dfb5a88ec418
3aaf7f2126db2cf85dba78b0aa91c9ae1cb222c7
41231 F20101123_AADAZJ yoon_c_Page_086.pro
e03ecb230ee656582dd3102e53f9844d
6d570c3b2cd0e3460a66b1c92d111c949579fcd4
27473 F20101123_AADAYW yoon_c_Page_066.pro
c646f389055e82932916788a2ead4844
ea44558f56b2066a78be26f4c82690a5e5d16efc
F20101123_AADBFF yoon_c_Page_031.QC.jpg
950403c726bca8247e2bd868655e5525
11a18e40b8d75ee6b07828d52e3026dcf38a200d
19516 F20101123_AADBEQ yoon_c_Page_022.QC.jpg
660cbe02786c950cf0205d0e909fe812
d96cd6c4465004ffa0605a271fb30edd387a6417
14247 F20101123_AADAZK yoon_c_Page_087.pro
0e2ddf0e66f60165b05b0c706bb84ed4
9da1df29248dc60d5ab22c7c36172aa30bd3c2d7
15988 F20101123_AADAYX yoon_c_Page_067.pro
f3d7f89cfc0b8b29bf3d980c927b19e3
ec18ee8e93b109d66135cc21056e2fb801dbad75
6033 F20101123_AADBFG yoon_c_Page_031thm.jpg
c4293f4a1ff7f089bf30efbdf0297935
77506948e47a3680e3aa78d996e58ad19986f951
5555 F20101123_AADBER yoon_c_Page_022thm.jpg
3775e6d0297dc2daec22aca9b041aafd
37e7e5469a78eb4e880d0a4abefea424c6cdef3f
15270 F20101123_AADAZL yoon_c_Page_088.pro
5097630b682d332296d139e440543082
42e1b621823ecead765cf3f887570551c1d87bb6
46370 F20101123_AADAYY yoon_c_Page_068.pro
8ea6efc02a19516a9a46a77e10a36794
a7bb9d466db8c7f94d87476474b8363a5bf5ec63
22981 F20101123_AADBFH yoon_c_Page_032.QC.jpg
f7da882d0b1bf6a78ba0f6949d5c62b9
54d3e507139c47ffd61ee9822c13bf799c97b212
6343 F20101123_AADBES yoon_c_Page_023thm.jpg
b41b63ab9c58eb80b5d1e85e81f890b1
2f638d1b2146110b65a70cca65e73e1837a12f08
43806 F20101123_AADAZM yoon_c_Page_089.pro
f4b3b62b87bda20cf453ac230d744083
01cc97a6196c40e3432262cc533ae98fac02c603
38994 F20101123_AADAYZ yoon_c_Page_069.pro
6ed37e9fbf73b547069e73123777d2f6
b7635e024ae1ff0cd663f8d1c529903d4fc159f0
6223 F20101123_AADBFI yoon_c_Page_032thm.jpg
3794ef41ab5e7ceb85faf0edff642701
29b92b4cd093894d1e3801f300c1b489395ca5cd
22567 F20101123_AADBET yoon_c_Page_024.QC.jpg
ff9fc39c8b020af5707d4bcfed556b1d
e3ab36b1edfdb14c876fccd0a81d990fac9b9148
43338 F20101123_AADAZN yoon_c_Page_090.pro
2513822df590b0c0d7b0a3da1b5f0505
a1c43e3c871592555330fa185adda54b735b4579
23079 F20101123_AADBFJ yoon_c_Page_033.QC.jpg
a7c9e8ce8b6fce4bda2783d1a3f9ed58
2ce29d9fc3387a606c561ba2eff78f57b3e6284b
22746 F20101123_AADBEU yoon_c_Page_025.QC.jpg
19f85e38e7ac6dbd6cc72694ea5ca9f3
454caabb107a8231679410c9181e6aa1934a0595
13502 F20101123_AADAZO yoon_c_Page_091.pro
c531e7ed54ed0458d94798a6c08b57fd
b4e79bfccfee788ff246935ba8cb06b3d6043042
F20101123_AADBFK yoon_c_Page_033thm.jpg
607a0f36634f2cc89cff606c29552277
5926ebcfc7ae96071c077321b235ce928e4d3e34
6483 F20101123_AADBEV yoon_c_Page_025thm.jpg
234daf1377675892021afd07469caa6b
5ac09bf9a1ccd76b38bc93d62d7f52c72a1892b5
25285 F20101123_AADAZP yoon_c_Page_092.pro
aa061aeda12b6e3c777bda222fe165f2
96976ea7218cfee037f939806e6a05dc84368d95
6355 F20101123_AADBFL yoon_c_Page_034thm.jpg
159643c60b9e9cba13c39a3c93a1ff84
1bd2085dcf4fed483aad4d620877376d8dbfe10d
23302 F20101123_AADBEW yoon_c_Page_026.QC.jpg
42d3e1576916db909af84ed05b4093ef
497b4bf34c9ba7587d3fd85cb9038250449f3e5b
13537 F20101123_AADAZQ yoon_c_Page_095.pro
ba666f0e4c0af339a948c2607a739eed
38eddb941c66c5c6186bc6178f02e043bceda60f
6573 F20101123_AADBEX yoon_c_Page_026thm.jpg
95c324ece41b634ad09f15797d738e14
7abc3bf8b8bd59686ac65c459b9f6a7d976f710d
47881 F20101123_AADAZR yoon_c_Page_097.pro
665418858354f8d959af202dc5c0dc38
80bfb0da3da1d201910bbe87c040d6031bb69e9b
5436 F20101123_AADBGA yoon_c_Page_043thm.jpg
86a10c476bbcff4aa98ee46dd18e71fc
be81ab8992b7b4bc2747b96fcb358519a4cdd289
6489 F20101123_AADBFM yoon_c_Page_035thm.jpg
de92efe291182975d9bc08c9f71e5b12
aa05597ae3ef59f6f1cf1dcadfcdb72c9c5067bb
23503 F20101123_AADBEY yoon_c_Page_027.QC.jpg
fbe215a01f7f274d3f18d90648ae62d5
a9b40879322e4468b2a545573f2395bb5d02d5f3
21724 F20101123_AADAZS yoon_c_Page_099.pro
7247dbab18b84edf631440827ef6e38d
fded102152b11a10d9798ed42259ab8d6f1404b7
6254 F20101123_AADBGB yoon_c_Page_044thm.jpg
707b4366c6f8350c492466dde8503f0e
9b3d08fc436797feff3b0e766fe5b142fd0a06fa
8578 F20101123_AADBFN yoon_c_Page_036.QC.jpg
a0772a4e3fe134ad9e2bd44aafa333ad
0c3322a26f2a822c3d9ec87d4036d268e3deb53c
6727 F20101123_AADBEZ yoon_c_Page_027thm.jpg
32ca735efb55e689c752261465661cf9
6672ad7bdf1d866507fbb1895cc59509a31267d6
41850 F20101123_AADAZT yoon_c_Page_100.pro
7d3c6e145bed8f9f17273b6142cd0ca1
15a7b202cb782ea50a41b33ba5549f082bdfb613
17247 F20101123_AADBGC yoon_c_Page_045.QC.jpg
563399959fe30059e05dcd01265faf0d
ad8303e2b37e17c1a7a2f18904cf3ac5680abf88
2799 F20101123_AADBFO yoon_c_Page_036thm.jpg
19da213b029c23720611f93a9ac681cb
9d5eecde69676e48b41bace0efb3e99a9ed2da47
37262 F20101123_AADAZU yoon_c_Page_102.pro
bc8b48b7a16dc3fb4ca3618f14805e88
813bdfd0d0fb98a340f7b64efeb99d74c4fd3bb5
5028 F20101123_AADBGD yoon_c_Page_045thm.jpg
fbf6857cd4c678cc55211bfefe629428
a85af82ebb4db6b12689f953f901c3af2bf97be2
5492 F20101123_AADBFP yoon_c_Page_037thm.jpg
4435d32b883f1ef9934bc9037443801e
50f3ca3014ecaec9a288cefb43bad4d606a339a7
37812 F20101123_AADAZV yoon_c_Page_103.pro
09c0d083a74e5ea69bc9fd97be2d4c92
e05b190d692f3d3cc9b35b20550e9dfcf7e4f7d6
22872 F20101123_AADBGE yoon_c_Page_046.QC.jpg
1a14a4b8bc28ae0ccf4ab50e7ab90d39
5c9f747112f5d6fdf062a06bfa6749d787a66e8b
20386 F20101123_AADBFQ yoon_c_Page_038.QC.jpg
9bd836deee63f51e3b79ce2a6bd3cd78
e0c56d70855095c6e81f2eef347e0c6bbf1769cf
37749 F20101123_AADAZW yoon_c_Page_104.pro
2253a2d54585e24e214b2c934d9b41b9
4d9e51b861b9deeb740bf259c2aaa06eab2b2df5
6424 F20101123_AADBGF yoon_c_Page_046thm.jpg
930b6d2b00e3c07bff1cc6271e3bdd19
75ac9cbe5726f44714f22fe2be28675565ee7311
5614 F20101123_AADBFR yoon_c_Page_038thm.jpg
77468866a51a2d67d4545764b9418df7
e2175aa3182c90a5f41da2099b25fdf007f70b42
51841 F20101123_AADAZX yoon_c_Page_106.pro
645aa518f699c24102b96750bc44cdf6
87fead81d2037934a0b401b8c60e698502c95352
15374 F20101123_AADBGG yoon_c_Page_047.QC.jpg
85b120b201e75b95b74f4e504e8a50c0
5c615e0a638a62e187dc1b2aff68a4d9b75792cf
21798 F20101123_AADBFS yoon_c_Page_039.QC.jpg
fbc43895fdb3117b2a9a4288225c2d95
27143da7e4a1ce491712e90547fa09158db262ef
61755 F20101123_AADAZY yoon_c_Page_107.pro
39da0100760a466b3dbe8cb11da4ef8e
7e2dba5405157ef4021a570716c4f1a75092e644
4875 F20101123_AADBGH yoon_c_Page_047thm.jpg
a212e0ccd93d9773c86213b852cb797e
07e89fa8f251a4bc835f3db1a12df20ae2d67fb5



PAGE 1

DOMAIN-SPECIFIC KNOWLEDGE-BASED INFORMATION RETRIEVAL MODEL USING KNOWLEDGE REDUCTION

By

CHANGWOO YOON

A DISSERTATION PRESENTED TO THE GRADUATE SCHOOL OF THE UNIVERSITY OF FLORIDA IN PARTIAL FULFILLMENT OF THE REQUIREMENTS FOR THE DEGREE OF DOCTOR OF PHILOSOPHY

UNIVERSITY OF FLORIDA

2005


Copyright 2005 by Changwoo Yoon


To my wife Jaesook, my daughter Jenny, my son Juhyung, and my family, in God with love


ACKNOWLEDGMENTS

I would like to thank my parents for their support. They have provided unconditional love and support. I greatly thank all my relatives for their loving concern and prayers. I would also like to thank William H. Donnelly for his support and care during my Ph.D. studies; without his support in the form of a research assistantship, I could not have continued my graduate work. I would like to thank my supervisory committee chair, Douglas D. Dankel II, for his guidance and excellent advice on my research. Finally, and most of all, I express my gratitude to my beloved wife, Jaesook. Her love, support, and prayer have not wavered in this lengthy process. She has undoubtedly been the single most integral component of my success.


TABLE OF CONTENTS

ACKNOWLEDGMENTS
LIST OF TABLES
LIST OF FIGURES
ABSTRACT

CHAPTER

1 INTRODUCTION
    1.1 Background about Intelligent Information Retrieval
    1.2 Intelligent Information Retrieval Model

2 INFORMATION RETRIEVAL
    2.1 Classical Information Retrieval Models
        2.1.1 Boolean Model
        2.1.2 Vector Space Model
        2.1.3 Probabilistic Model
    2.2 Alternative Information Retrieval Models
        2.2.1 Latent Semantic Indexing (LSI)
        2.2.2 Lateral Thinking in Information Retrieval
    2.3 Information Retrieval Models Involving Reasoning
    2.4 Evaluating Information Retrieval Performance
    2.5 Useful Techniques
        2.5.1 Stopword Removal
        2.5.2 Stemming
        2.5.3 Passage Retrieval
        2.5.4 Query Expansion
        2.5.5 Using Phrases
    2.6 Enhancement of IR Through Given Knowledge
        2.6.1 Using WordNet
        2.6.2 Using UMLS, SNOMED
    2.7 Summary

3 KNOWLEDGE REPRESENTATION BY BAYESIAN NETWORK
    3.1 Semantic Networks
    3.2 Probability Principles and Calculus
    3.3 Bayesian Network
    3.4 Noisy-OR: Bayesian Network Inference
    3.5 QMR-DT Model
    3.6 Bayesian Classifiers
        3.6.1 Naïve Bayes
        3.6.2 Selective Naïve Bayes
        3.6.3 Seminaïve Bayes
        3.6.4 Tree Augmented Naïve Bayes
        3.6.5 Finite Mixture (FM) Model
    3.7 Summary

4 KNOWLEDGE-BASED INFORMATION RETRIEVAL MODEL ARCHITECTURE
    4.1 SNOMED
    4.2 Anatomic Pathology Database (APDB) Design and Development
        4.2.1 Metadata Set Definition
        4.2.2 Information Processing: Retrieval and Extraction
    4.3 Summary

5 KNOWLEDGE-BASE MANAGEMENT ENGINE
    5.1 Semantic Network Knowledge Base Model Representing SNOMED
    5.2 Classification of the Post-Coordinated Knowledge
        5.2.1 Statistics of the Pathology Patient Report Document Space
        5.2.2 Classification of Post-Coordinated Knowledge
    5.3 Statistical Model of the Post-Coordinated Knowledge
    5.4 Naïve Bayes Model of Post-Coordinated Knowledge
    5.5 Summary

6 KNOWLEDGE CONVERSION ENGINE (KCE)
    6.1 Vector Space Model Document Vector
    6.2 Conceptual Document Vector
    6.3 KCE: Knowledge Reduction
    6.4 KCE: Conversion of Pre-Coordinated Knowledge
    6.5 KCE: Generating the Conceptual Document Vector
    6.6 KCE: Conversion of the Post-Coordinated Knowledge
        6.6.1 Statistical Model of Post-Coordinated Knowledge
        6.6.2 Probabilistic Model of Post-Coordinated Knowledge
    6.7 VSM IR Engine: Document Retrieval
    6.8 Summary

7 PERFORMANCE EVALUATION
    7.1 Simulation Parameters
    7.2 Simulation Results
        7.2.1 Performance Evaluation with Pre-Coordinated Knowledge
        7.2.2 Performance Evaluation with Naïve Bayes Post-Coordinated Knowledge
        7.2.3 Performance of the Statistical Post-Coordinated Knowledge Model
    7.3 Summary

8 CONCLUSION
    8.1 Contributions
    8.2 Future Work

APPENDIX

A PRIMARY TERMS WHICH ARE THE BASIS FOR THE DB ATTRIBUTE
B SNOMED STATISTICS

LIST OF REFERENCES
BIOGRAPHICAL SKETCH


LIST OF TABLES

5-1 Number of AP data each year from to
5-2 Number of unique SNOMED axes equations
5-3 Relation statistics among axes
5-4 Statistics on post-coordinated knowledge
7-1 Relevancy check result of 261 simulation documents
7-2 Value of performance gain of pre-coordinated knowledge compared to VSM
7-3 Value of performance gain of post-coordinated knowledge
A-1 Primary terms for APDB
B-1 Partial list of T code
B-2 Partial list of M code
B-3 Partial list of E code
B-4 Partial list of F code
B-5 Partial list of D code
B-6 Partial list of P code


LIST OF FIGURES

1-1 Knowledge-based information retrieval model
2-1 Vector space model example diagram
2-2 Recall rate and precision
2-3 Relationship between recall and precision
3-1 Example of the probability for combined evidence
3-2 Forward serial connection Bayesian network example
3-3 Diverging connection Bayesian network example
3-4 Converging connection Bayesian network example
3-5 Example of the chain rule
3-6 Example of noisy-OR
3-7 General architecture of the noisy-OR model
4-1 Architecture of the knowledge-based information retrieval model
4-2 Architecture of the knowledge-based information retrieval model detailed in the example domain
4-3 The "equation" of SNOMED disease axes
5-1 The three types of SNOMED term relations
5-2 SNOMED hierarchical term relationship
5-3 SNOMED synonym relationship
5-4 SNOMED multiaxial relationship
5-5 Classification of post-coordinated knowledge
5-6 An example of a four-axis-relation post-coordinated knowledge
5-7 Structure of the post-coordinated knowledge in a Bayesian network
5-8 PCKB component structure and probability estimation
6-1 Knowledge reductions
6-2 Attributes of the SNN-KB hierarchical topology relation
6-3 Example of domain-specific knowledge relations
6-4 Conversion of type-M relations
6-5 Examples of case 2
7-1 Performance evaluation metrics
7-2 Comparison of performance for query 1 on positive cases
7-3 Evaluation results of query 1 including the neutral cases
7-4 Evaluation results for query 2 for the positive cases
7-5 Evaluation results for query 2 including the neutral cases
7-6 Evaluation results of query 1 including post-coordinated knowledge
7-7 Evaluation results of query 2 including post-coordinated knowledge
7-8 Evaluation results of query 1 including statistical post-coordinated knowledge
8-1 Knowledge reduction to statistical model
8-2 Off-line application of knowledge


Abstract of Dissertation Presented to the Graduate School of the University of Florida in Partial Fulfillment of the Requirements for the Degree of Doctor of Philosophy

DOMAIN-SPECIFIC KNOWLEDGE-BASED INFORMATION RETRIEVAL MODEL USING KNOWLEDGE REDUCTION

By

Changwoo Yoon

August 2005

Chair: Douglas D. Dankel II
Major Department: Computer and Information Science and Engineering

Information is a meaningful collection of data. Information retrieval (IR) is an important tool for changing data into information. Of the three classical IR models (Boolean, Vector Space, and Probabilistic), the Vector Space Model (VSM) is the most widely used. But the classical VSM measures the relevancy between a query and documents only through term frequency (tf) and inverse document frequency (idf), which is not sufficient to produce effective results that reflect knowledge.

Knowledge is organized information imbued with intelligence. To augment the IR process with knowledge, several techniques have been proposed, including query expansion using a thesaurus, term relationship measurements such as Latent Semantic Indexing (LSI), and probabilistic inference engines using Bayesian networks.

We created an information retrieval model that incorporates domain-specific knowledge to provide knowledgeable answers to users. We used a knowledge-based model to represent domain-specific knowledge. Unlike other knowledge-based IR models, our model converts domain-specific knowledge into relationships of terms represented as quantitative values, which gives improved efficiency.


CHAPTER 1
INTRODUCTION

The objective of this thesis is to create an intelligent information retrieval model that produces effective, knowledge-reflecting results using a computationally efficient method.

1.1 Background about Intelligent Information Retrieval

Conceptually, information retrieval (IR) is the process of changing data into information. More technically, information retrieval is the process of determining the relevant documents from a collection of documents, based on a query presented by the user. If we look at the World Wide Web (WWW) before any processing (e.g., a search), each document or web page is a datum. These data are uninterpreted signals or raw observations that reach our senses. Providing meaning to these data allows them to become information, which is more meaningful and useful to humans than the raw data. Information retrieval is the process that extracts information from data.

One of the best-known information retrieval models is Boolean search. In the Boolean search model, we specify a set of query words that is compared to the words in the documents, retrieving precisely those documents that contain the given set of query words. We can call the retrieved documents information, but it is hard to call them knowledge, because additional tasks, such as browsing each document and selecting the more meaningful ones, are required to transform the retrieved documents into some form of knowledge. Knowledge is organized information.
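As a minimal illustration of this exact-match behavior, consider the following sketch. The three-document collection and the query are hypothetical toy data, not the pathology corpus used later in this thesis:

```python
# Boolean-style retrieval sketch: return exactly those documents that
# contain every query word (toy data, purely illustrative).
docs = {
    1: "renal cell carcinoma of the left kidney",
    2: "chronic inflammation of renal tissue",
    3: "carcinoma in situ of the bladder",
}

def boolean_search(query, docs):
    query_words = set(query.lower().split())
    # A document is retrieved only if it contains the full set of query words.
    return [doc_id for doc_id, text in docs.items()
            if query_words <= set(text.lower().split())]

print(boolean_search("renal carcinoma", docs))  # -> [1]
```

The output is information, a list of matching documents, but organizing that list into knowledge still falls to the reader.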


The classic vector information retrieval model is an attempt to infuse knowledge into information retrieval results by using the frequency of the query terms found in the documents. Intelligent information retrieval, or semantic information retrieval, attempts to use some form of knowledge representation within the IR model to obtain more organized information (i.e., improved precision, which is defined in Section 2.4), that is, knowledge. But it is difficult to codify or regulate knowledge. An ontology is an attempt to regulate knowledge: the specification of a conceptualization (Gruber, 1993). In artificial intelligence research, ontologies are used for knowledge representation and for the Semantic Web (Berners-Lee et al., 2001), the abstract representation of data on the World Wide Web, in an attempt to make the semantics of a body of knowledge more explicit.

We can classify an ontology as either general domain or closed domain. WordNet (Miller, 1990) is an example of a general ontology (consisting of a thesaurus and a taxonomy) that aims to represent general-domain documents written in natural language. Closed-domain data contrast with general-domain data in that the subject of a closed domain is confined. For example, a company offering tourists information about excursions and outings might maintain this information in a database, and such a database would consist exclusively of tour-related data. A closed domain typically has its own knowledge repository, such as a term dictionary and the relations that exist between terms. Good examples of such repositories are the medical field's Unified Medical Language System (UMLS) and Systematized Nomenclature of Medicine (SNOMED). We call these domain-specific knowledge. The nature of closed-domain data allows us to use better semantics than general-domain data permits.

Applying knowledge in the information retrieval process normally requires significant computation. This computation occurs when the intelligent information retrieval system tries to search the knowledge space during the retrieval process. From this, we can derive the following set of research questions for closed-domain IR using domain-specific knowledge:

- How can we effectively express domain-specific knowledge as an ontology?
- What is the relationship between explicit semantics, ontology, and information retrieval?
- How can we maximize the efficiency of IR using the given domain-specific knowledge (ontology)?

1.2 Intelligent Information Retrieval Model

Our research aims to create an information retrieval model that incorporates domain-specific knowledge to provide knowledge-infused answers to users. The closed-domain data we used consist of pathology patient reports. Figure 1-1 is a conceptual model of the proposed domain-specific knowledge-based information retrieval model; details of the model are given in Chapters 4, 5, and 6. A classical vector space model (VSM) information retrieval system using term frequency and inverse document frequency creates the query vector (1) and the document vector (2). The knowledge base management engine (KME) creates the knowledge (5) from the existing document set (3) before system operation starts, and adds knowledge (5) from new documents (4) as they enter the database. The knowledge conversion engine (KCE) applies the knowledge (semantics) of the knowledge base (7) to the document vector (6) to create the conceptual document vector (8). The conventional VSM IR engine then calculates the relevance between the query vector (9) and the conceptual document vector (10), resulting in a ranked document list (11).
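The following minimal sketch traces this data flow under simplifying assumptions: a three-document toy corpus, and a knowledge base already reduced to a single pairwise term relevancy, namely an assumed synonym weight of 0.9 between renal and kidney. The actual KME and KCE algorithms are developed in Chapters 5 and 6; the sketch shows only that knowledge is folded into the document vectors off-line, leaving a plain similarity computation at query time.

```python
# Minimal sketch of the Figure 1-1 data flow. All values are hypothetical
# toy data; the actual KME/KCE algorithms are defined in Chapters 5 and 6.
import math

docs = ["kidney carcinoma", "renal biopsy", "kidney biopsy report"]
vocab = sorted({w for d in docs for w in d.split()})

def tfidf_vector(text):
    """Classical VSM weighting: term frequency times inverse document frequency."""
    words = text.split()
    vec = []
    for term in vocab:
        tf = words.count(term)
        df = sum(term in d.split() for d in docs)
        vec.append(tf * math.log(len(docs) / df) if df else 0.0)
    return vec

# (1)-(2) Query and document vectors come from the classical VSM indexer.
doc_vectors = [tfidf_vector(d) for d in docs]

# (7) Knowledge base reduced to pairwise term relevancies; here a single
# assumed domain fact, the synonym pair renal/kidney with weight 0.9.
knowledge = {("renal", "kidney"): 0.9, ("kidney", "renal"): 0.9}

# (6)-(8) Off-line KCE step: spread each term's weight to related terms,
# producing the conceptual document vector.
def conceptual_vector(vec):
    out = list(vec)
    for (src, dst), w in knowledge.items():
        out[vocab.index(dst)] += w * vec[vocab.index(src)]
    return out

conceptual = [conceptual_vector(v) for v in doc_vectors]

def cosine(u, v):
    dot = sum(x * y for x, y in zip(u, v))
    norm = math.sqrt(sum(x * x for x in u)) * math.sqrt(sum(x * x for x in v))
    return dot / norm if norm else 0.0

# (9)-(11) At query time only a similarity measurement remains. Without the
# knowledge step, "kidney biopsy report" would score zero for this query.
query = tfidf_vector("renal carcinoma")
ranked = sorted(range(len(docs)), key=lambda i: cosine(query, conceptual[i]),
                reverse=True)
print([docs[i] for i in ranked])
# -> ['kidney carcinoma', 'renal biopsy', 'kidney biopsy report']
```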


[Figure 1-1. Knowledge-based information retrieval model]

Using this model results in the following contributions to information retrieval research:

- This information retrieval model is a knowledge-based IR model. Unlike other models, which perform knowledge-level information retrieval tasks such as ontology comparison and ontological query expansion, this model reduces the knowledge level represented by the knowledge base to the information level, such as the vector space model's document vector.
- Unlike other knowledge-based IR models, which have a heavy computation requirement because they compare concepts between the IR model and the query when the user requests information, this model applies knowledge to the document vector off-line, leaving only a similarity measurement calculation between the query and the documents.


- When a new document arrives in the system, we modify the knowledge base with only the knowledge that can be obtained and augmented from that new document, not from the pre-defined knowledge base. We call this a dynamic feature of the knowledge base.
- The dynamic feature of the knowledge base can be mapped to a statistical feature by off-line knowledge conversion. This means that we apply the changes to the document vector and the knowledge base at specified time intervals, not when they are introduced.
- This model can be applied to IR applications in the general domain if these applications have a domain-specific knowledge ontology.
- Unlike other models, which have difficulty applying a knowledge hierarchy to the IR model, the knowledge-based model uses a hierarchical term relevancy value to express the knowledge hierarchy.

The organization of this thesis is as follows. Chapter 2 surveys current research efforts in information retrieval. Chapter 3 surveys current research topics in knowledge representation and inference using probability, concentrating on Bayesian networks. Chapter 4 introduces the proposed information retrieval model for closed-domain data. Chapters 5 and 6 discuss the details of the model. Chapter 7 presents a performance evaluation of the model. The thesis concludes with Chapter 8, which outlines future research work to be completed.


CHAPTER 2
INFORMATION RETRIEVAL

2.1 Classical Information Retrieval Models

Information retrieval (IR) is a process that finds relevant documents (information) from a document collection given a user's request (generally queries). In contrast to data retrieval, which consists of determining which documents of a collection contain the keywords in the user's query, an IR system is concerned with retrieving information about a subject represented by the user's query. There are three classic models in information retrieval: the Boolean, the vector, and the probabilistic models (Yates and Neto, 1999, p. 21). The Boolean model is set theoretic because documents and queries are represented as sets of index terms. The vector model is algebraic because documents and queries are represented as vectors in a t-dimensional space, where t is the total number of index terms. In the probabilistic model, probability theory forms the framework for modeling document and query representations.

2.1.1 Boolean Model

The Boolean model is a simple retrieval model based on set theory and Boolean algebra (Yates and Neto, 1999, p. 25). In Boolean information retrieval, a query typically consists of a Boolean expression, such as (cat OR dog) AND NOT goldfish, and each document is represented by the set of terms it contains. The execution of a query consists of obtaining, for each term in the query, the set of documents containing this term.


These sets of retrieved documents are then combined using the usual set-theoretic union (for OR queries), intersection (for AND queries), or difference (for NOT queries) to obtain a final set of documents that match the query.

The Boolean model provides a framework that is easy for a common user of an IR system to understand. Furthermore, the queries are specified as Boolean expressions having precise semantics. But the Boolean model suffers from two major drawbacks. First, using the Boolean model requires skilled users who can formulate quality Boolean queries. When the only users of an IR system are librarians, for example, or computer scientists conversant in logic, and the information to be searched is in a known or restricted form (such as bibliographic records), a Boolean system is adequate. However, in cases where the users are less skilled, or the information to be searched is less well defined, a ranked strategy (vector space, probabilistic, etc.) may be more effective. The Boolean model's second drawback is that its retrieval strategy is based on a binary decision criterion (i.e., a document is predicted to be either relevant or non-relevant) without any notion of a grading scale, which prevents good retrieval performance. Thus, the Boolean model is in reality much more a data retrieval model.
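A minimal sketch of these set operations, using a hypothetical three-document collection (the documents and terms are illustrative, not from the thesis corpus):

```python
# Boolean retrieval sketch: each document is represented by the set of terms
# it contains, and query operators map to set operations.
docs = {
    1: {"cat", "dog", "house"},
    2: {"dog", "goldfish"},
    3: {"cat", "bird"},
}

def postings(term):
    """Return the set of document ids containing the term."""
    return {doc_id for doc_id, terms in docs.items() if term in terms}

# Evaluate (cat OR dog) AND NOT goldfish with union and difference.
result = (postings("cat") | postings("dog")) - postings("goldfish")
print(sorted(result))  # [1, 3]
```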


2.1.2 Vector Space Model

The vector space information retrieval model, first introduced by Salton et al. (1975), takes a geometrical approach. A vector, called the document vector, represents each document. This vector is of identical length for all documents, with the length equaling the number of unique terms in the entire collection of documents. Salton et al. (1975) defined the term weight (also known as the importance weight) as the ability of a term to differentiate one document having the term from other documents having the same term.

A number of weighting schemes can be used in the vector space model. Salton uses two properties: the term frequency and the inverse document frequency. The term frequency (tf) is the intra-document importance, which is the frequency of the term occurring in a document. Term frequency measures how well that term describes the document content. A term with a higher term frequency is more important than a term with a lower frequency. The inverse document frequency (idf) is based on the number of documents in the corpus in which the term occurs. The inverse document frequency of term j is calculated as

    idf_j = log(N / n_j)

where N is the number of documents in the collection, and n_j is the number of documents in which term j occurs. The inverse document frequency is the inter-document importance. If a term is uniformly present across the entire system, the term is less capable of differentiating the documents, which means that it has less importance than a term having a small global weight. We can calculate the term weight w_{i,j} of term i in document j as

    w_{i,j} = tf_{i,j} × idf_i

where tf_{i,j} is the term frequency of term i in document j, and idf_i is the inverse document frequency of term i in the entire set of documents.
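A short sketch of tf-idf weighting under the definitions above; the three-document corpus is a hypothetical illustration:

```python
import math
from collections import Counter

# Hypothetical toy corpus: each document is a list of (already stemmed) terms.
corpus = [
    ["lung", "granuloma", "fever"],
    ["lung", "inflammation"],
    ["fever", "cough", "fever"],
]

N = len(corpus)
# n_j: number of documents in which term j occurs.
doc_freq = Counter(term for doc in corpus for term in set(doc))

def tf_idf_vector(doc):
    """Weight w_{i,j} = tf_{i,j} * idf_i for one document."""
    tf = Counter(doc)
    return {term: count * math.log(N / doc_freq[term])
            for term, count in tf.items()}

print(tf_idf_vector(corpus[2]))
# "fever" occurs twice here and in 2 of 3 documents: weight 2 * log(3/2)
```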


After constructing the document and query vectors using the weighting scheme, we calculate the similarity coefficient. One of the best-known similarity coefficients is the cosine measure (Salton, 1968), defined for the query vector q = (q_1, q_2, ..., q_t) and the document vector d_j = (w_{1,j}, w_{2,j}, ..., w_{t,j}), where t is the number of terms, as

    sim(q, d_j) = cos(q, d_j) = (q · d_j) / (|q| |d_j|)
                = Σ_{i=1}^{t} q_i w_{i,j} / ( sqrt(Σ_{i=1}^{t} q_i^2) × sqrt(Σ_{i=1}^{t} w_{i,j}^2) ).

The cosine similarity measures the angle between the query and document vectors in n-dimensional Euclidean space. Suppose that we have a query consisting of two terms and a set of documents that may or may not contain those terms. Figure 2-1 illustrates the vector model and its similarity measure between two documents, d_1 and d_2, and a query q that contain those terms. The similarity between document 1 (d_1) and the query is s_{1,q}, while the similarity between document 2 (d_2) and the query is s_{2,q}. A small sketch of this computation follows.

[Figure 2-1. Vector Space Model example diagram]
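A minimal sketch of the cosine measure; the two-term weight vectors are hypothetical values in the spirit of Figure 2-1:

```python
import math

def cosine_similarity(q, d):
    """cos(q, d) = (q . d) / (|q| |d|) for equal-length weight vectors."""
    dot = sum(qi * di for qi, di in zip(q, d))
    norm_q = math.sqrt(sum(qi * qi for qi in q))
    norm_d = math.sqrt(sum(di * di for di in d))
    return dot / (norm_q * norm_d)

# Hypothetical two-term query and document weight vectors.
q = [0.8, 0.3]
d1 = [0.5, 0.4]
d2 = [0.1, 0.9]
print(cosine_similarity(q, d1), cosine_similarity(q, d2))
```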


2.1.3 Probabilistic Model

Probabilistic retrieval defines the degree of relevance of a document to a query in terms of the probability that the document is relevant to the query. Maron and Kuhns (1960) first introduced the concept of probabilistic indexing in the context of a library searching system. Robertson and Sparck-Jones (1976) introduced what is now known as the binary independence retrieval (BIR) model, which is considered the standard model of probabilistic retrieval.

The fundamental assumption of the probabilistic model is that it estimates the probability of the relevancy of a document to a given user's query q. If we state this as an equation, we can define the similarity of the j-th document d_j to a query q as the ratio

    sim(d_j, q) = P(R | d_j) / P(R̄ | d_j)    (2-1)

where R is the set of documents known to be relevant, R̄ is the set of non-relevant documents, P(R | d_j) is the probability that document d_j is relevant to the query q, and P(R̄ | d_j) is the probability that d_j is non-relevant to the query q. The problem with Equation 2-1 (one disadvantage of the probabilistic model) is that we must guess the initial value of the document relevancy. The first probabilistic model, the BIR model, also did not consider the term frequency, which is a basic assumption of the vector space model.

2.2 Alternative Information Retrieval Models

The classical information retrieval models do not consider the dependency among the index terms. For example, in the vector space model, all terms in the document vector are orthogonal. The Latent Semantic Indexing (LSI) model (Furnas et al., 1988) is one of the IR models that incorporates term dependency.


2.2.1 Latent Semantic Indexing (LSI)

The classical information retrieval models use index terms as querying tools. The selection of the index terms is based on the assumption that the terms represent the user's need, that is, they represent the concept of the user's query intention. But as search results show, index terms do not really capture the concepts behind information retrieval. For example, if the user wants to search for "Major cities in Florida," the index terms used may be "Major," "city," and "Florida." The search engine may try to find documents containing these keywords. But if the search engine is intelligent and supports conceptual matching, it would try to search for keywords such as "Tampa," "Orlando," and "Miami" in the same way a human does.

The main idea of Latent Semantic Indexing (LSI) comes from the fact that a document may contain words having similar concepts. LSI considers documents that have many words in common to be semantically close, and vice versa (Furnas et al., 1988). Following the example in the previous paragraph, if the words major, city, Florida, Tampa, Orlando, and Miami appear together in enough documents, the LSI algorithm will conclude that those terms are semantically close and then return all documents containing the terms Tampa, Orlando, and Miami, even though these latter terms are not part of the given index terms.

The most important point of the LSI algorithm is that all calculations are performed automatically by only looking at the document collection and index terms. As a result, the problems of polysemy and synonymy can be addressed efficiently without the aid of a thesaurus. Polysemy is the problem of a word having more than one meaning. Synonymy is the problem that there are many ways of describing the same object.


LSI generally uses a statistical method called Singular Value Decomposition (SVD) to uncover the word associations between documents. The effect of SVD is to move words and documents that are closely associated nearer to one another in the projected space. It is possible for an LSI-based system to locate and use terms that do not even appear in a document. Documents that are located in a similar part of the concept space are retrieved, rather than only matching keywords. The sketch below illustrates the projection step.
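A minimal sketch of the truncated-SVD projection behind LSI, using a hypothetical term-document matrix; this illustrates the technique generally, not the thesis implementation:

```python
import numpy as np

# Hypothetical term-document matrix: rows are terms, columns are documents,
# entries are (weighted) term counts.
A = np.array([
    [2.0, 1.0, 0.0, 0.0],   # "florida"
    [1.0, 2.0, 0.0, 0.0],   # "city"
    [0.0, 1.0, 1.0, 0.0],   # "tampa"
    [0.0, 0.0, 2.0, 1.0],   # "goldfish"
])

# Truncated SVD: keep only the k largest singular values.
k = 2
U, s, Vt = np.linalg.svd(A, full_matrices=False)
doc_coords = (np.diag(s[:k]) @ Vt[:k, :]).T  # documents in k-dim concept space

# Documents close in the projected space are semantically related even if
# they share few literal terms.
print(np.round(doc_coords, 2))
```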


2.2.2 Lateral Thinking in Information Retrieval

The human brain is divided into two halves: the left excels at sequential thinking, where the desired outcome is achieved by following a logical sequence of actions. In contrast, the right brain is optimized for creativity, where the desired outcome may require a degree of non-linear processing. Most information retrieval activity is focused on the requirements of sequential thinking, which is most comfortable when searching with precision. An example of sequential thinking in information retrieval is a Boolean logic search. When searching for specific information, traditional techniques can be used to find documents containing required keywords combined with Boolean logic. Sequential thinking, which is a left-brain process, is an analogous term to vertical thinking.

Sometimes we are looking for information about a particular topic, but the concept is nebulous and difficult to articulate precisely. With this type of query it is difficult to specify our search so that all of the best documents are found without too many irrelevant ones. These difficulties are compounded if there is uncertainty about the presence of documents, for example in searches designed to gather evidence of, or to prove the absence of, information about the selected topic.

A successful outcome is likely to involve some right-brain activity as we iterate the process with carefully modified search criteria. This kind of brain activity is called lateral thinking (Bono, 1973). The lateral thinking process is concerned with insight and creativity. It is a probabilistic rather than a finite process. In an information retrieval context, vertical thinking is used when we know precisely what we are looking for, and selecting the finite set of relevant documents is relatively straightforward. In contrast, lateral thinking is applied where the requirements are less well defined and the process of locating relevant information involves some degree of trial and error. Unfortunately, traditional techniques, employed when searching with precision, do not provide much assistance with this type of problem, and the user is left to try query after query until they have exhausted all permutations.

The ability to automatically identify multi-word concepts is absolutely fundamental to providing some assistance to the right brain when searching unstructured information. Without this ability the system is simply analyzing individual word frequencies that are unlikely to make much sense to a human brain when taken out of context. Several approaches (i.e., linguistics, artificial intelligence, and Bayesian networks) have attempted to imbue concepts into the information retrieval model without much success.

Given that 90% of data is unstructured, the current statistical information retrieval methods face difficulties. If the data are well structured, as in a relational database, where a query is very specific, we can predict a precise result; this is like vertical thinking. Unfortunately, many people expect to search unstructured information in the same way and are often disappointed when the documents they expect to find are not returned.


The problem is that unstructured data are highly variable in layout, terminology, and style, while the queries tend to be more difficult to define.

Yann et al. (2003) suggested using feedback from user requests to retrieve alternative documents that may not be returned by more conventional search engines, in a way that may recall lateral thinking, to solve a heterogeneous large-scale pharmaceutical database problem. The proposed solution replaces the query expansion phase with a query processing phase, where evolved modules are applied to the query with two major results (Yann et al. 2003, p. 215):

- Rewritten queries will preferably retrieve documents that match fields of interest of the user.
- Other documents related to previous and present queries will be retrieved, therefore bringing some lateral thinking abilities to the search engine.

The system employs evolutionary algorithms, used interactively to evolve a user profile at each new query. This profile is a set of modules that perform basic rewriting tasks on the words of the query. The evaluation step is extremely simple: a list of documents corresponding to the processed query is presented to the user. The documents actually viewed by the users are considered interesting, and the modules that retrieved those documents are rewarded accordingly. Modules that rarely or never contribute to the retrieval of interesting documents are simply discarded and replaced by newly generated modules. They used a genetic programming technique to evolve the user profile modules automatically.

2.3 Information Retrieval Models Involving Reasoning

A Bayesian network is a directed acyclic graph whose nodes represent random variables and whose edges represent causal relationships between nodes. A causal relationship means that if two nodes are connected, the parent node (i.e., the node from which the edge comes) is considered to be a potential cause of the child node (i.e., the node to which the edge points). We can consider the causal relationship as a probabilistic dependency (Fung and Favero, 1995).


Lee et al. (2002) also proposed a Bayesian network model for a medical language understanding system, which provides a noise-tolerant and context-sensitive character to the system. They showed a relevant inference based on Bayesian network patterns. Information models performing inference based on Bayesian networks are not yet at a mature stage, and significant research is still needed in this area. This method also has a problem with the heavy computational requirements needed to perform the inference.

2.4 Evaluating Information Retrieval Performance

An evaluation of a system is usually performed before the release of the computer system. Commonly, the measures of a system's performance are time and space. For example, in a data retrieval system like a database system, the response time and the space requirement are the most interesting metrics. But in an information retrieval system, other metrics are also interesting (Yates and Neto, 1999). This results from the vagueness of a user's request to an information retrieval system. The retrieval results also produce partial matches. The most common IR system, the vector space model, produces documents ranked according to their relevance to the query. So the evaluation of information retrieval should have a metric that evaluates how precise the answer of the IR system is. The most commonly used metrics for relevancy evaluation of IR are recall and precision.


Consider a database where there are 100 documents related to the general field of data extraction. A query on text mining may retrieve 400 documents. If only 40 of the retrieved documents are about data extraction, the recall rate of the tested engine is 40%, since the database contains 100 documents on data extraction (Schweitzer, 2003). Since only 40 documents among the 400 matched the request of the user, the precision rate of the engine on this test is 10%. See Figure 2-2. If the desired set of returned documents (i.e., the target) is known, the recall rate is the proportion of returned documents that match the target with respect to the total size of the target. The precision is the proportion of relevant documents in the document set returned by the system:

    Recall = |Relevant ∩ Retrieved| / |Relevant|
    Precision = |Relevant ∩ Retrieved| / |Retrieved|

[Figure 2-2. Recall rate and precision (all documents; Retrieved = 400, Relevant = 100, overlap = 40)]

Trivially, if an algorithm always retrieves all documents in a document base, it has one hundred percent recall. However, this retrieval has low precision because it is unlikely that all documents match the query. In this sense, precision and recall have an inverse relation, shown in Figure 2-3. In many evaluations, precision is measured at a fixed number of retrieved documents, e.g., precision at 25, which gives a measure of how well an algorithm delivers at the top of the retrieved list. In others, recall and precision are plotted against each other: precision at a certain point of recall indicates how many irrelevant documents readers must examine until they know they have found at least half of the interesting documents.
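A minimal sketch of these two measures, reproducing the 40%/10% example above with hypothetical document-id sets:

```python
def recall(relevant, retrieved):
    """|Relevant ∩ Retrieved| / |Relevant|"""
    return len(relevant & retrieved) / len(relevant)

def precision(relevant, retrieved):
    """|Relevant ∩ Retrieved| / |Retrieved|"""
    return len(relevant & retrieved) / len(retrieved)

# 100 relevant documents, 400 retrieved, 40 in common (ids are hypothetical).
relevant = set(range(100))
retrieved = set(range(60, 460))
print(recall(relevant, retrieved), precision(relevant, retrieved))  # 0.4 0.1
```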


In the Text REtrieval Conference (TREC) evaluations, a 10-point average measure is used, with precision measured at every 10 percent of recall: at 10 percent recall, at 20 percent recall, and so forth to 100 percent recall, where all relevant documents are assumed to have been retrieved (Baeza and Neto, 1999, p. 76). The average precision at all those recall points is used as the total measure.

[Figure 2-3. Relationship between recall and precision]

Several methods help to maximize recall rates, for example, query expansion using synonyms. Using this method, a search engine will also find documents on data extraction provided that its thesaurus contains data as a synonym for text and extraction as a synonym for mining. Significant research is currently being performed on man-made thesauri to ensure that all documents that could match a query are actually found (Foskett, 1997).

2.5 Useful Techniques

Other than the core information retrieval algorithm, there are a number of techniques that are mandatory for IR processing, such as document preprocessing, stopword removal, and stemming. This section discusses several of these techniques that might improve IR performance using text processing.


2.5.1 Stopword Removal

Stopwords are words that occur very frequently among documents in the collection. In general, stopwords do not carry any useful information. Articles, prepositions, and conjunctions such as in, of, the, etc., are natural candidates for a list of stopwords. Stopword removal has often been shown to be effective at improving retrieval effectiveness, even though many term-weighting approaches are designed to give a lower weight to terms appearing in many documents. It also has the benefit of reducing the size of the index term structure. Stopword removal is built into many IR engines. In some situations, stopword removal causes reduced recall. For example, if the user's query is "to be or not to be," the only index term left after stopword removal is "be." As a result, some search engines do not adopt stopword removal; they use full-text indexing instead.

2.5.2 Stemming

Stemming is the process of removing affixes (i.e., prefixes and suffixes), allowing the retrieval of documents containing syntactic variations of query terms (Yates and Neto, 1999, p. 165). This can involve, for instance, removing the final s from plural nouns or converting verbs to their base form (go and goes both become go, etc.). The most widely known stemming algorithm is the Porter algorithm (Porter, 1980), which is built into many information retrieval engines. The Porter algorithm uses a suffix list for suffix stripping. The algorithm has several rules applicable to the suffixes of words. For example, one rule converts plural forms into their singular forms by substituting the suffix letter s with nil. A combined sketch of both steps appears below.
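A minimal sketch of stopword removal followed by a toy suffix-stripping rule in the spirit of the Porter algorithm; the stopword list and the single rule are illustrative simplifications, not the full algorithm:

```python
STOPWORDS = {"in", "of", "the", "a", "an", "or", "and", "not", "to", "be"}

def strip_plural_s(word):
    """Toy Porter-style rule: drop a final 's' (the real algorithm applies
    many ordered rules with guard conditions)."""
    return word[:-1] if word.endswith("s") and not word.endswith("ss") else word

def preprocess(text):
    tokens = text.lower().split()
    return [strip_plural_s(t) for t in tokens if t not in STOPWORDS]

print(preprocess("The reports of the patients"))  # ['report', 'patient']
```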


2.5.3 Passage Retrieval

Passage retrieval is the process of retrieving text in smaller units than complete documents. The basic assumption of passage retrieval is that terms inside a meaningful unit, like a sentence, carry more meaning than terms spread across a document. Callan (1994) describes several approaches to passage identification, including paragraph recognition and window-based approaches, in which the position of the passage is determined by the positions in the document of the terms matching the query. In the classical information retrieval method, the order and distance of index terms in the documents and the query have no meaning. If we use a word as an index term unit and multiple closely located words combine to form a specific phrase, the order and distance among the index terms can make a difference when compared with unordered terms.

2.5.4 Query Expansion

Whenever a user wants to retrieve a set of documents, he starts by constructing a concept about the topic of interest. Such a conceptualization is called the information need. Given an information need, the user must formulate a query that is adequate for the information retrieval system. Usually, the query is a collection of index terms, which might be erroneous and improper initially. In this case, a reformulation of the query should be done to obtain the desired result. The reformulation process is called query expansion.

One of the simplest techniques involves the use of a thesaurus to find synonyms for some or all of the terms in the query. These synonyms are added to the query to broaden the search, as in the sketch below.
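A minimal sketch of thesaurus-based query expansion; the synonym table is a hypothetical stand-in for a domain thesaurus:

```python
# Hypothetical thesaurus mapping a term to its acceptable synonyms.
THESAURUS = {
    "text": ["data"],
    "mining": ["extraction"],
}

def expand_query(terms):
    """Add every known synonym of every query term to the query."""
    expanded = list(terms)
    for term in terms:
        expanded.extend(THESAURUS.get(term, []))
    return expanded

print(expand_query(["text", "mining"]))  # ['text', 'mining', 'data', 'extraction']
```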


The thesaurus used can be manually generated for a specific domain, such as the medical domain. But for a general domain like the Web, it is hard to generate such a knowledge base, because the documents from the general domain are comparably new, large, and dynamically changing. Various algorithms have been suggested for generating thesauri automatically. For example, Crouch and Yang (2000) suggest a method based on clustering and term discrimination value theory.

Another widely used method of query expansion is the use of relevance feedback. This involves the user performing a preliminary search, then examining the documents returned and deciding which are relevant. Finally, terms from these documents are added to the query and the search is repeated. This obviously requires human intervention and, as a result, is inappropriate in many situations. However, there is a similar approach, sometimes called pseudo-relevance feedback, in which the top few documents from an initial query are assumed relevant and are used for automatic feedback (Mitra et al. 1998).

2.5.5 Using Phrases

Many information retrieval systems are based on a vector space model (VSM) that represents a document as a vector of index terms. The classical VSM uses a word as an index term. To improve retrieval accuracy, it is natural to replace word stems with concepts. For example, replacing word stems with a Unified Medical Language System (UMLS) code if the document domain is medical is a possible way to include a concept in information retrieval. However, previous research showed not only no improvements, but a degradation in retrieval accuracy when concepts were used in document retrieval.


Replacing word stems with multiple-word combinations was also studied. One study used a phrase as an indexing term (Mao and Chu, 2002). A phrase is a string of words used to represent a concept. The conceptual similarity and common word stems jointly determine the correspondence between two phrases, which yields an increase in retrieval accuracy when compared to the classical VSM model.

Separating the importance of weighting in the VSM model has also been suggested (Shuang et al. 2004). Shuang et al. considered phrases to have more importance than individual terms in information retrieval. They used a tuple of two separate similarity measures between the document and the query, (phrase-sim, term-sim), where phrase-sim is the similarity obtained by matching the phrases of the query against the documents and term-sim is the usual similarity measure used in the VSM model. Documents are ranked in descending order of (phrase-sim, term-sim), where phrase-sim has the higher priority, as in the sketch below.
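A minimal sketch of the lexicographic (phrase-sim, term-sim) ranking; the similarity scores are hypothetical placeholders for the two measures:

```python
# Each entry: (document id, phrase-sim, term-sim); scores are hypothetical.
scored = [
    ("d1", 0.2, 0.9),
    ("d2", 0.5, 0.1),
    ("d3", 0.5, 0.4),
]

# Python compares tuples lexicographically, so phrase-sim dominates and
# term-sim only breaks ties.
ranked = sorted(scored, key=lambda x: (x[1], x[2]), reverse=True)
print([doc for doc, _, _ in ranked])  # ['d3', 'd2', 'd1']
```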


2.6 Enhancement of IR Through Given Knowledge

2.6.1 Using WordNet

WordNet is an electronic lexical database developed at Princeton University beginning in 1985 (Miller, 1990). WordNet 2.0 has over 130,000 word forms. It is widely used in natural language processing, artificial intelligence, and information technology applications such as information retrieval, document classification, question-answer systems, language generation, and machine translation. The basic building blocks of WordNet are synonym sets (synsets), which are unordered sets of distinct word forms and which correspond closely to what are called concepts. Examples of synsets are {car, automobile} or {shut, close}. WordNet 2.0 contains some 115,000 synsets.

There are two kinds of relations in WordNet: semantic and lexical relations. Examples of semantic relations are is-a, part-of, cause, etc. An is-a semantic relation hierarchically organizes nouns and verbs from the top generic concepts to the bottom specific concepts. Examples of lexical relations are synonymy and antonymy.

There have been several attempts to use WordNet for information retrieval (Chai and Biermann, 1997). Query expansion is one such method, expanding the query with terms having similar meaning using a thesaurus like WordNet. This technique increases the chances of retrieving more relevant documents. Several other research projects on query expansion using WordNet have been performed (Voorhees, 1994), but the results are not good: there is a small increase in recall but a degradation in precision. Rila et al. (1998) concluded that the degradation of IR performance using WordNet is caused by the poorly defined structure of WordNet. It is impossible to find term relationships across different parts of speech because words in WordNet are grouped based on part of speech. Most of the relationships between two terms are not found in WordNet because WordNet handles general lexical knowledge. Sanderson reviewed most efforts in information retrieval using WordNet and noted that a simple dictionary (or thesaurus) based word sense representation has not been shown to greatly improve retrieval effectiveness (Sanderson, 2000).

A recent study on word sense disambiguation in information retrieval using WordNet (Kim et al. 2004) shows the possibility of improving IR performance using WordNet knowledge. They proposed a root sense tagging approach. They noticed that the traditional method described in the previous paragraph used fine-grained disambiguation for IR tasks. For example, the word stock has 17 different senses in WordNet, which are used in word sense disambiguation. Its root senses include act, animal, artifact, attribute, body, etc. Using these classifications when performing word sense disambiguation, called coarse-grained disambiguation, showed an improvement of retrieval effectiveness.


2.6.2 Using UMLS and SNOMED

Medical language is extremely rich, varied, and difficult to comprehend and standardize, and it has vagueness and imprecision. As a result, there have been many efforts to build medical term dictionary structures such as the Unified Medical Language System (UMLS) and the Systematized Nomenclature of Medicine (SNOMED). SNOMED is a hierarchically organized and systematized multiaxial nomenclature of medical and scientific terms. We provide more detail on SNOMED in Chapter 4.

The terms in SNOMED and UMLS often require expert knowledge, so non-experts like patients and lawyers cannot recognize the terms used. This problem motivates efforts to combine WordNet and UMLS (Barry and Fellbaum 2004), since WordNet was not built for domain-specific applications, creating a need for a lexical database designed specifically for the needs of natural-language processing in the medical domain. This approach expands the synonym thesaurus, resulting in information retrieval query expansion.

There are many efforts to visualize the concept of information. Sometimes a figure is worth a thousand words (Pfitzner et al. 2003), with the use of a picture facilitating a user's understanding of the presented information. Keynets, developed by Kenneth (http://www.ccs.neu.edu/home/kenb/key/fast/fast.html), is one information visualization technique for representing information in a visual manner. To extract meaning from technical documents, ontologies such as UMLS and semantic frameworks like Keynets can be combined, which improves the accuracy and expressiveness of natural language processing.

2.7 Summary

We described three classical information retrieval models: the Boolean, the vector, and the probabilistic models.


There are several attempts to augment knowledge in the information retrieval process, such as query expansion and using a phrase as a searching term. Our attempts to incorporate knowledge in IR involve using a knowledge source directly as a form of knowledge representation. Possible candidates for knowledge sources include UMLS and SNOMED. Our developed model uses knowledge in the form of a semantic network and a Bayesian network. The next chapter explains the background required to understand the knowledge base, especially the probabilistic Bayesian network model.


CHAPTER 3
KNOWLEDGE REPRESENTATION BY BAYESIAN NETWORK

As we will see, the knowledge in our experimental domain (pathology) consists of two types. The first is pre-defined knowledge that can be used in describing data (i.e., a patient's report). This type of knowledge can be expressed well using a semantic network. The second type of knowledge is obtained from data and is not pre-defined. Normally, experts describe this knowledge after analyzing the data. Errors can intervene during the writing and analyzing process, which means there is uncertainty in the knowledge. This type of data can be modeled well by a probability model, especially the Bayesian network. This chapter presents a discussion of knowledge representation issues, concentrating on semantic networks and Bayesian networks, and surveys some of the relevant literature.

3.1 Semantic Networks

Semantic networks are often used as a form of knowledge representation. They were developed for representing the knowledge within English sentences by mimicking human memory's structure of having a large number of connections and associations between the different pieces of information contained in it. Today, the term associative networks is more widely used to describe these networks, since they are used to represent more than just semantic relations. They are widely used to represent physical and/or causal associations between various concepts or objects.


A semantic network is a directed graph consisting of vertices that represent concepts and edges that represent semantic relations between the concepts. An important feature of any associative network is the associative links that connect the various nodes within the network. It is this feature that makes associative graphs different from simple directed graphs. Within knowledge-based systems, associative networks are most commonly used to represent semantic associations. In more technically oriented applications, they can be used to express both the physical and causal structure of systems. The important semantic relations often used within a semantic network are: Meronymy (A is part of B), Holonymy (B has A as a part of itself), Hyponymy (or troponymy) (A is subordinate of B; A is a kind of B), Hypernymy (A is superordinate of B), Synonymy (A denotes the same as B), and Antonymy (A denotes the opposite of B). An example of a semantic network is WordNet, a lexical database of English.

A major problem of semantic networks is that, although the name of this knowledge representation contains the word semantic, there is no clear semantics of the various network representations. By representing the knowledge explicitly within an associative network, a knowledge-based system obtains a higher level of understanding of the actions, causes, and events that occur within a domain. The higher level of understanding allows the system to reason more completely about problems that exist within the domain and to develop better explanations in response to user queries (Gonzalez and Dankel 1988, p. 167).
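A minimal sketch of a semantic network as a labeled directed graph; the handful of medical concepts and relations here are hypothetical illustrations:

```python
# Each edge (source, relation, target) asserts one semantic relation.
edges = [
    ("lung", "is-a", "organ"),
    ("lobe", "part-of", "lung"),
    ("pneumonia", "located-in", "lung"),
]

def relations_from(concept):
    """All outgoing semantic relations of a concept node."""
    return [(rel, dst) for src, rel, dst in edges if src == concept]

print(relations_from("lung"))       # [('is-a', 'organ')]
print(relations_from("pneumonia"))  # [('located-in', 'lung')]
```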


3.2 Probability Principles and Calculus

This section provides the core principles necessary to understand Bayesian calculus, which is the base model of the proposed knowledge base. It starts with the basics of probability calculus and then introduces the concepts of subjective probability and conditional probability.

Probability is a method for articulating uncertainty. It gives a quantitative understanding of uncertainty, providing a quantitative method for encoding likelihood. Probabilistic methods and models give us the ability to attach numbers to the likelihood of various results. The standard view of probability is the frequentist view. This view says that probability is really a statement of frequency. You can obtain a probability by watching recurring events repeat over time. For example, the probability of a hurricane hitting Florida during hurricane season can be determined by examining the historical record of where hurricanes have struck the USA. In this view, probability is something that is inherent in the process.

An alternative view of probability that is very useful to artificial intelligence research is the subjective view, or Bayesian view. In the subjective view, probability is a model of your degree of belief in some event. A Bayesian probability is the value or belief of the person who assigns the probability (e.g., your degree of belief that a coin will land heads), whereas a classical probability is based on the physical properties of the world (e.g., the probability that a coin will land heads). In light of these statements, a degree of belief in an event is referred to as a Bayesian or personal probability, while the classical probability is referred to as the true or physical probability of that event.


Probability is a logic and a language for talking about the likelihood of events. An event is a set of atomic events, which is a subset of the universe of all events. A probability distribution is a function that maps events into the range of values between 0 and 1. Probability satisfies the following properties:

    P(true) = 1 = P(Universe),
    P(false) = 0 = P(∅), and
    P(A ∨ B) = P(A) + P(B) − P(A ∧ B).

A random variable describes a probability distribution in which the atomic events are the possible values that could be given to the variable. If we have multiple random variables, we can talk about their joint distribution, or the probability assignment to all combinations of the values of the random variables. In general, the joint distribution cannot be computed from the individual distributions. If we know all values of the joint distribution, we can answer any probability question. But if the domain is big, the complexity grows exponentially.

We can introduce the concept of conditional probability:

    P(A|B) = P(A ∧ B) / P(B)    (3-1)

This is the probability of A given B, and it states that we are restricting our consideration just to the part of the world in which B is true. We can derive Bayes' rule from the definition of conditional probability:

    P(A|B) = P(B|A) P(A) / P(B)    (3-2)

To make this more concrete, consider the medical domain where we have diseases and the symptoms associated with each disease:

    P(disease|symptom) = P(symptom|disease) P(disease) / P(symptom).
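A minimal numeric sketch of Bayes' rule in the disease/symptom form above; all probability values are hypothetical:

```python
# Hypothetical numbers: P(disease), P(symptom|disease), P(symptom|~disease).
p_d = 0.01
p_s_given_d = 0.9
p_s_given_not_d = 0.05

# P(symptom) by the conditioning rule, then Bayes' rule for the posterior.
p_s = p_s_given_d * p_d + p_s_given_not_d * (1 - p_d)
p_d_given_s = p_s_given_d * p_d / p_s
print(round(p_d_given_s, 3))  # ~0.154
```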


The probability of a symptom given a disease is generally constant and does not change according to the particular situation or patient. So it is easier, more useful, and more generally applicable to learn these causal relationships, and Bayes' rule therefore has practical importance for conditional probability.

We can use the conditioning rule to obtain P(A):

    P(A) = P(A|B) P(B) + P(A|~B) P(~B) = P(A ∧ B) + P(A ∧ ~B)

We say A and B are independent if and only if the probability that A and B are true is the product of the individual probabilities of A and B being true:

    P(A ∧ B) = P(A) P(B)
    P(A|B) = P(A)
    P(B|A) = P(B)

Independence is essential for efficient probabilistic reasoning. There is a more general notion called conditional independence. This states that A and B are conditionally independent given C if and only if the probability of A given B and C is equal to the probability of A given C:

    P(A|B,C) = P(A|C)
    P(B|A,C) = P(B|C)
    P(A ∧ B|C) = P(A|C) P(B|C)

We can solve a Bayesian network probability distribution using Bayes' rule and conditional independence:

    P(C|T,X) = P(T,X|C) P(C) / P(T,X)

Assume T and X are conditionally independent given C:

    P(C|T,X) = P(T|C) P(X|C) P(C) / P(T,X)


[Figure 3-1. Example of the probability for combined evidence]

We can obtain P(T,X) from the following derivation. Since

    P(C|T,X) + P(~C|T,X) = 1,

we have

    P(T|C) P(X|C) P(C) / P(T,X) + P(T|~C) P(X|~C) P(~C) / P(T,X) = 1,

and therefore

    P(T,X) = P(T|C) P(X|C) P(C) + P(T|~C) P(X|~C) P(~C).

A small numeric sketch of this combined-evidence calculation follows.
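A minimal sketch of the combined-evidence posterior P(C|T,X) under the conditional independence assumption; all probability values are hypothetical:

```python
# Hypothetical parameters: prior P(C) and likelihoods of two evidence
# variables T and X that are conditionally independent given C.
p_c = 0.1
p_t_given = {True: 0.8, False: 0.1}   # P(T|C), P(T|~C)
p_x_given = {True: 0.7, False: 0.2}   # P(X|C), P(X|~C)

# P(T,X) = P(T|C)P(X|C)P(C) + P(T|~C)P(X|~C)P(~C)
p_tx = (p_t_given[True] * p_x_given[True] * p_c
        + p_t_given[False] * p_x_given[False] * (1 - p_c))

# P(C|T,X) = P(T|C)P(X|C)P(C) / P(T,X)
p_c_given_tx = p_t_given[True] * p_x_given[True] * p_c / p_tx
print(round(p_c_given_tx, 3))  # ~0.757
```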


3.3 Bayesian Networks

A Bayesian network is an efficient factorization of the joint probability distribution over a set of variables. If we want to know everything in the domain, we need to know the joint probability distribution over all those variables. If the domain is complicated, with many different propositional variables, this is infeasible. For example, if you have n binary variables, then there are 2^n possible assignments, and the joint probability distribution requires a number for each one of those possible assignments. The intuition behind Bayesian networks is that there is almost always some separability between the variables (i.e., some independence), so that we do not actually have to know all of those 2^n numbers to know what is occurring in the world. Bayesian networks have two components. The first component is called the causal component; it describes the structure of the domain in terms of the dependencies between variables. The second part is the actual numbers, the quantitative part.

There are three connection types in Bayesian networks. First is the forward serial connection shown in Figure 3-2. Evidence is transmitted from A to C through B unless B is instantiated (i.e., its truth value is known). The evidence propagates backward through the serial links as long as the intermediate node is not instantiated. If the intermediate node is instantiated, then evidence does not propagate.

[Figure 3-2. Forward serial connection Bayesian network example]
[Figure 3-3. Diverging connection Bayesian network example]
[Figure 3-4. Converging connection Bayesian network example]

The second connection type is the diverging connection shown in Figure 3-3. In a diverging connection, there are arrows going from B to A and from B to C. If B is not instantiated, the evidence of A propagates through to C. But if B is instantiated, the propagation is blocked.

The tricky case is when we have a converging connection like Figure 3-4, where A points to B and C points to B. Let us first think about the case when neither B nor any of its descendants is instantiated. In that case, evidence does not propagate from A to C.


For example, suppose B is Sore throat, A is Bacterial infection, and C is Viral infection. If we find that someone has a bacterial infection, it gives us information about whether they have a sore throat, but it does not affect the probability that they also have a viral infection. But when either node B is instantiated, or one of its descendants is, we know something about whether B is true. And in that case, information does propagate through from A to C.

If two variables are d-separated, then changing the uncertainty on one does not change the uncertainty on the other. Two variables A and B are d-separated if and only if for every path between them, there is an intermediate variable V such that either the connection is serial or diverging and V is known, or the connection is converging and neither V nor any descendant has evidence. For example, if the connection A-B-C is serial, it is blocked when B is known and connected otherwise. When it is connected, information can flow from A to C or from C to A.

Bayesian networks are sometimes called belief networks or Bayesian belief networks. A Bayes net consists of three components: a finite set of variables, each of which has a finite domain; a set of directed arcs between the nodes, forming an acyclic graph; and, for every node A with parents B_1 through B_n, a specified conditional probability distribution P(A|B_1, ..., B_n). The crucial theorem about Bayesian networks is that if A and B are d-separated given some evidence e, then A and B are conditionally independent given e; that is, P(A|B,e) = P(A|e). We can exploit these conditional independence relationships to make inference efficient.

The chain rule results from the conditional independence relationships of Bayesian networks. Let us assume there are n Boolean variables V_1, ..., V_n.


The joint probability distribution is the product of all the individual probability distributions that are stored in the nodes of the graph:

    P(V_1 = v_1, V_2 = v_2, ..., V_n = v_n) = Π_i P(V_i = v_i | parents(V_i))    (3-3)

[Figure 3-5. Example of chain rule]

If we compute the probability that A, B, C, and D are all true, we can use conditioning to write that as

    P(A ∧ B ∧ C ∧ D) = P(D|A,B,C) P(A ∧ B ∧ C).

We can simplify P(D|A,B,C) to P(D|C), because given C, D is d-separated from A and B. And we have P(D|C) stored directly in a local probability table, so we are done with this term. Now we can use conditioning to write P(A ∧ B ∧ C) as P(C|A,B) times P(A ∧ B), and these can be simplified by d-separation:

    P(A ∧ B ∧ C) = P(C|A,B) P(A ∧ B) = P(C|A,B) P(A) P(B).

For each variable, we just have to condition on its parents. Then, we multiply the results together to obtain the joint probability distribution. This means that if you have any independence (if you have anything other than all the arrows in your graph, in some sense), then you have less work to do to compute the joint distribution. The sketch below works this example with concrete numbers.
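A minimal sketch of the chain-rule factorization for the A, B, C, D example above (A and B parentless, C with parents A and B, D with parent C); all table entries are hypothetical:

```python
# Hypothetical local probability tables for the network A -> C <- B, C -> D.
p_a = 0.3
p_b = 0.6
p_c_given_ab = {(True, True): 0.9, (True, False): 0.5,
                (False, True): 0.4, (False, False): 0.1}
p_d_given_c = {True: 0.8, False: 0.2}

# P(A ^ B ^ C ^ D) = P(A) P(B) P(C|A,B) P(D|C)
joint_all_true = p_a * p_b * p_c_given_ab[(True, True)] * p_d_given_c[True]
print(round(joint_all_true, 4))  # 0.3 * 0.6 * 0.9 * 0.8 = 0.1296
```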


3.4 Noisy-OR: Bayesian Network Inference

Imagine that there are three possible causes for having a fever: flu, cold, and malaria. The network of Figure 3-6 encodes the fact that flu, cold, and malaria are mutually independent of one another.

[Figure 3-6. Example of Noisy-OR]

In general, the conditional probability table for fever will have to specify the probability of fever for all possible combinations of values of flu, cold, and malaria. This is a large table, and it is hard to assess. Physicians, for example, probably do not think very well about combinations of diseases. It is more natural to ask them individual conditional probabilities: what is the probability that someone has a fever if they have the flu? We are essentially ignoring the influence of cold and malaria while we think about the flu. The same goes for the other conditional probabilities: we can ask about P(fever|cold) and P(fever|malaria) separately. We are assuming that the causes act independently, which reduces the set of numbers that we need to acquire.

If the patient has flu, and the connection is on, then he will certainly have a fever. Thus it is sufficient for one connection to be made into fever from any of its positive causes. If none of the causes are true, then the probability of fever is assumed to be zero (though it is always possible to add an extra cause that is always true, but which has a weak connection, to model the possibility of getting a fever for no reason).

Here is the general formula for a noisy-OR. Assume we know P(effect|cause) for each possible cause, and we are given a set C_T of causes that are true for a particular case.


Then to compute the probability of E given C, we compute the probability of not E given C:

    P(E|C) = 1 − P(~E|C)    (3-4)

That is equal to the probability of not E given just the causes that are true in this case, C_T. And because of the assumption that the causes operate independently (that is, whether one is in effect is independent of whether another is in effect), we can take the product over the causes of the probability of the effect being absent given each cause. Finally, we can easily convert the probability of not E given C into 1 minus the probability of E given C:

    P(E|C) = 1 − P(~E|C) = 1 − P(~E|C_T) = 1 − Π_{C_i ∈ C_T} P(~E|C_i) = 1 − Π_{C_i ∈ C_T} (1 − P(E|C_i))    (3-5)

A small numeric sketch of this formula follows.

[Figure 3-7. General architecture of noisy-OR model]
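A minimal sketch of Equation 3-5 for the fever example; the individual P(fever|cause) values are hypothetical:

```python
import math

# Hypothetical single-cause probabilities P(fever|cause).
p_effect_given_cause = {"flu": 0.6, "cold": 0.3, "malaria": 0.9}

def noisy_or(true_causes):
    """P(E|C) = 1 - prod over true causes of (1 - P(E|cause))."""
    return 1.0 - math.prod(1.0 - p_effect_given_cause[c] for c in true_causes)

print(noisy_or({"flu"}))           # 0.6
print(noisy_or({"flu", "cold"}))   # 1 - 0.4 * 0.7 = 0.72
```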


3.5 QMR-DT Model

The QMR-DT model is a two-level or bipartite Bayesian network intended for use as a diagnostic aid in the domain of internal medicine. We provide a brief overview of the QMR-DT model here; for further details see Shwe and Cooper (1991).

The QMR-DT model is a bipartite graphical model in which the upper layer of nodes represents diseases and the lower layer of nodes represents symptoms. There are approximately 600 disease nodes and 4000 symptom nodes in the database proposed by Shwe and Cooper (1991). The evidence is a set of observed symptoms, which are referred to as findings. We use the symbol f to represent the vector of findings and the symbol d to denote the vector of diseases. All nodes are binary; thus the components f_i and d_i are binary random variables. The diseases and findings occupy the nodes on the two levels of the network, respectively, and the conditional probabilities specifying the dependencies between the levels are assumed to be noisy-OR gates (Pearl 1988).

There are a number of simplifying assumptions in this model. First, in the absence of findings, the diseases appear independent from each other with their respective prior probabilities (i.e., marginal independence), although some diseases probably do depend on other diseases. Second, the findings are conditionally independent given the diseases. The probability model implied by the QMR-DT belief network can be written as the joint probability of diseases and findings:

    P(f, d) = P(f|d) P(d) = [ Π_i P(f_i|d) ] [ Π_j P(d_j) ]    (3-6)

where d and f are binary (1/0) vectors referring to the presence/absence states of the diseases and the positive/negative states or outcomes of the findings, respectively. The prior probabilities of the diseases, P(d_j), were obtained by Shwe et al. from archival data. The conditional probabilities, P(f_i|d), for the findings given the states of the diseases were obtained from expert assessments and are assumed to be noisy-OR models:


    P(f_i = 0|d) = P(f_i = 0|L) Π_{j ∈ pa_i} P(f_i = 0|d_j)    (3-7)
                 = (1 − q_{i0}) Π_{j ∈ pa_i} (1 − q_{ij})^{d_j}    (3-8)

where pa_i (the parents of i) is the set of diseases pertaining to finding f_i, q_{ij} = P(f_i = 1|d_j = 1) is the probability that disease j, if present, could alone cause finding i to have a positive outcome, and q_{i0} = 1 − P(f_i = 0|L) is the leak probability, i.e., the probability that the finding is caused by means other than the diseases included in the belief network model. The effect of each additional disease, if present, is to contribute an additional factor of (1 − q_{ij}) to the probability that the i-th finding is absent.

3.6 Bayesian Classifiers

In this section, we introduce some classifiers in the form of Bayesian networks that can be used in the modeling of medical diagnosis. We can define the classification problem as a function assigning labels to observations (Miquelez et al. 2004, p. 340). If there is a vector x = (x_1, ..., x_n) and a class variable C, we can regard the classifier as a function

    γ: (x_1, ..., x_n) → {1, 2, ..., |C|}

that assigns labels to observations. This can be rewritten to obtain the label with the highest posterior probability, i.e.,

    γ(x) = argmax_c p(c | x_1, ..., x_n).

We can use the Bayesian classifier in medical diagnostics to find the probable disease from the given symptoms. We will use the notation O, meaning outcome, for the class variable C, and F, meaning finding, for the observed variables in the explanations in the following chapters. We use capital letters for variable names and small letters for their values.


3.6.1 Naïve Bayes

The concept that combines Bayes' theorem and the conditional independence hypothesis has been proposed under several names: idiot Bayes (Ohmann et al. 1988), naïve Bayes (Kononenko, 1990), simple Bayes (Gammerman and Thatcher 1991), or independent Bayes (Todd and Stamper 1994). The naïve Bayes (NB) approach (Minsky, 1961) is the simplest form of classifier based on Bayesian networks. The outcome variable O is defined as the common parent of the findings F = {F_1, ..., F_n}, and each of the findings F_i is a child of the outcome variable O. The shape of the network is always the same: all variables F_1, ..., F_n are considered to be conditionally independent given the value of the outcome variable O, which is the main assumption of NB.

This is a conditional probability model. We can calculate the posterior probability using Bayes' rule and conditional independence:

    P(O|F_1, ..., F_n) = P(O) P(F_1, ..., F_n|O) / P(F_1, ..., F_n) = P(O) Π_{i=1}^{n} P(F_i|O) / P(F_1, ..., F_n)

The main advantage of this approach is that the structure is always fixed and simple to calculate, because the order of the dependences to be found is fixed and reduces to two variables. The fixed form of the conditional probability distributions results in a considerable reduction in the number of parameters necessary: the naïve Bayes model only requires 2n+1 parameters, where n is the number of findings F_i, whereas the joint probability requires on the order of 2^n parameters. But there is no relationship between findings, which is not realistic in the real world. Even so, there is extensive literature showing that these kinds of simple computational models can perform surprisingly well (Domingos and Pazzani 1997) and are able to obtain results comparable to other, more complex classifiers. A minimal sketch follows.
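A minimal sketch of the naïve Bayes posterior over a binary outcome with two findings; the probability tables are hypothetical:

```python
# Hypothetical parameters: P(O) and P(F_i = 1 | O) for two binary findings.
p_o = 0.2
p_f_given_o = [
    {True: 0.9, False: 0.3},   # P(F_1 = 1 | O), P(F_1 = 1 | ~O)
    {True: 0.7, False: 0.4},   # P(F_2 = 1 | O), P(F_2 = 1 | ~O)
]

def posterior(findings):
    """P(O|F_1,...,F_n) via P(O) * prod P(F_i|O), normalized over O."""
    def score(o, prior):
        s = prior
        for table, f in zip(p_f_given_o, findings):
            p1 = table[o]
            s *= p1 if f else (1.0 - p1)
        return s
    s_true, s_false = score(True, p_o), score(False, 1.0 - p_o)
    return s_true / (s_true + s_false)

print(round(posterior([True, True]), 3))  # ~0.568 with both findings positive
```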


3.6.2 Selective Naïve Bayes

The selective naïve Bayes is a subtly different model compared to the naïve Bayes, adding a selective feature over the findings. In the selective naïve model, not all variables have to be present in the final model (Kohavi and John 1997; Langley and Sage 1994). The naïve Bayes model requires all variables to appear for some types of classification problems, but some variables could be irrelevant or redundant for classification purposes. It is known (Liu and Motoda 1998; Inza et al. 2000) that the naïve Bayes paradigm degrades in some cases, so the motivation of removing variables is modeled in the selective naïve Bayes (Miquelez et al. 2004, p. 340).

3.6.3 Seminaïve Bayes

The intuition in the seminaïve Bayes model is that we can combine variables (i.e., findings) together (Kononenko, 1991). It allows groups of variables to be considered as a single node in the Bayesian network, aiming to avoid the strict premises of the naïve Bayes paradigm.

3.6.4 Tree Augmented Naïve Bayes

In the tree augmented naïve Bayes (Friedman et al. 1997), the dependencies between variables other than C are taken into account. The model represents the relationships between the variables X_1, ..., X_n, conditional on the class variable C, by using a tree structure. The tree augmented naïve Bayes structure is built using a two-phase procedure. First, the dependencies between the different variables X_1, ..., X_n are learned. This algorithm uses a score based on information theory, and the weight of a branch (X_i, X_j) in a given Bayesian network S is defined by the mutual information measure conditional on the class variable as


    I(X_i, X_j | C) = Σ_c P(c) I(X_i, X_j | C = c) = Σ_c Σ_{x_i, x_j} P(x_i, x_j, c) log( P(x_i, x_j | c) / (P(x_i | c) P(x_j | c)) )

With these conditional mutual information values the algorithm builds a tree structure. In the second phase, the structure is augmented into the naïve Bayes paradigm.

3.6.5 Finite Mixture (FM) Model

The finite mixture (FM) model tries to relax the conditional independence assumption of the naïve Bayes model (Cheeseman and Stutz 1996). In an FM model, all the dependencies between the observed variables, both the findings and the outcome variable, are assumed to be modeled by a single discrete latent (i.e., unobserved) variable (Monti and Cooper 1998, p. 593). In an FM model the outcome variable is itself a child node, and the common parent is a latent variable.

3.7 Summary

We described two knowledge representation models: semantic networks and Bayesian networks. There have been attempts to model medical diagnosis using probabilistic Bayesian models. Shwe's QMR-DT model is a two-level noisy-OR model using disease and symptom nodes, where the nodes in the same layer are independent. The QMR-DT model uses several assumptions to reduce the complexity of the joint probability distribution calculation, but it shows exponential time complexity when implemented as an algorithm.

There have been several attempts to use Bayesian classifiers in a medical diagnosis model: naïve Bayes, selective naïve Bayes, seminaïve Bayes, tree augmented naïve Bayes, the finite mixture model, and the finite mixture augmented naïve Bayes.


Unlike the other models' modeling of dependency among findings, naïve Bayes assumes conditional independence among the findings. But even with the simplicity of the modeling, naïve Bayes shows good performance when compared to other, more complex models. The next chapter explains the overall architecture of the Knowledge-Based Information Retrieval (KBIR) model, which uses semantic networks and naïve Bayes as its knowledge model.


CHAPTER 4
KNOWLEDGE-BASED INFORMATION RETRIEVAL MODEL ARCHITECTURE

This research developed a knowledge-based information retrieval model for a closed domain. Figure 4-1 shows the architecture of the model.

[Figure 4-1. Architecture of the knowledge-based information retrieval model]

The overall operation of the model is as follows. A classical vector space model (VSM) information retrieval model using term frequency and inverse document frequency creates a query vector (1a) and a document vector (2a). The knowledge base management engine (KME) creates (3b) knowledge from the set of existing documents (3a) before system operation starts.

PAGE 55

43 system operation starts. The KME processes and adds knowledge from any new documents (5b) added to the document space. The Knowledge Conversion Engine (KCE) applies the knowledge (semantics) of the Knowledge Base to the Document Vector (2b, 3d) to create the Conceptual Document Vector (4a). The conventional VSM IR engine calculates the relevance between the query vector and the conceptual document vector (1b, 1c) resulting in a ranked document list (1d). To illustrate proof of concept, we implement this model in the domain of pathology. Figure 4-2 is a detailed architecture of the resulting model. The edges of this diagram represent procedures or actions taken in processing the nodes, which represent data or subsystems. Among the procedures shown by the edges, the bold edge processes (1a, 1b, 1c) are on-line processes, while edges shown with normal lines are off-line processes completed before the start of any users query processing. For this domain the knowledge base is named the SNOMED Semantic Network Knowledge Base (SNN-KB). The SNN-KB is part of the KME developed from the off-line processing (4a) of SNOMED. The documents used in the pathology domain are pathology reports called Anatomic Pathology (AP). Because we preprocessed AP raw text data into a database, the actual data from the documents used in this system are contained in the Anatomic Pathology Database (APDB). The Document Vector is produced (2b) from the APDB, and the KME creates (2a) the dynamic parts of the SNN-KB. When a new document is added (3a), the KME modifies (3b) the Document Vector and the SNN-KB. The Knowledge Conversion Engine (KCE) initially makes the Conceptual Document Vector (5c) from the Document Vector and the KMEs SNN-KB (5a, 5b). Periodically the KCE

PAGE 56

44 updates the Conceptual Document Vector (CDV) to reduce the computational needs rather than updating the CDV every time a new document is added. Query QueryVector DocumentsAPDB DocumentVector Knowledge baseManagement engine New Document KnowledgeConversion Engine VSM IR engine ConceptualDocumentVector RankedResult RankedResult SNN-KB SNOMED 1a1b1c1d2a2b3a4a5a5b5c 3b Figure 4-2. Architecture of the knowledge-based information retrieval model detailed in the example domain Before we describe the Knowledge Base Management Engine, we describe SNOMED and the characteristics and pre-processing of the example data: the Anatomic Pathology Database (APDB). 4.1. SNOMED Surgical Pathology, cytology, and autopsy reports are highly structured documents describing specimens, their diagnoses, and retrieval and charge specification codes. The Systematized Nomenclature of Medicine (SNOMED) developed by the College of American Pathologists is used for a retrieval code. This was developed in collaboration with multiple professional societies, scientists, physicians, and computer consultants [Systematized, 1979]. SNOMED II is a hierarchically organized and systematized

PAGE 57

45 multiaxial nomenclature of medical and scientific terms. There are six main axes based on the nature of man. These begin with a hierarchical listing of anatomical systems, known as the Topography (or T) axis. Any change in form of topography structures throughout life is characterized in the Morphology (or M) axis. Causes or etiologies for those changes are listed in the Etiology (or E) axis. All human functions, normal and abnormal, are listed in the Function (or F) axis. Combinations of Topography, Morphology, Etiology, and Function may constitute a disease entity or syndrome and are classified in the Disease (or D) axis. Using the T, M, E, F, and D axes it is possible to code nearly all-anatomic and physiologic features of a disease process as shown by the example in Figure 4-3. T + M + E + F = D Lung + Granuloma + M.tuberculosis + fever = Tuberculosis Figure 4-3. The Equation of SNOMED disease axes There is another field that is not part of the disease equation: a Procedure field, classified in the Procedure (or P) axis, which allows identification of services or actions performed on behalf of the patient with the problem. Pathology reports typically consist of useful, apt, and concrete terms in sentence or template format. The diagnostic terminology in reports and SNOMED involve standard terms and acceptable synonyms, both have the same SNOMED code number (e.g., Pneumonia and pneumonitis are coded T28000 M40000 or lung + inflammation). Pathology reports usually contain a specific field for SNOMED codes. Certain anatomic pathology computer systems include SNOMED files that allow code selection, but automated encoding programs are uncommon. Pre-coded synoptic templates of diagnostic terms allow consistency for diagnostic encoding, but many diagnostic

PAGE 58

46 statements contain descriptive language, semantic forms, and linguistic nuances that make automated coding difficult. There is a continual need for error checking. 4.2 Anatomic Pathology Database (APDB) Design and Development Two important characteristics of the APDB patient records are their fixed data and closed domain. The systems target data are patient records from 1980 to the present, which we consider as fixed or static, meaning that any dynamic features of the system is minimized. The nomenclature used in a patient report is restricted to the domain of anatomic pathology and related areas of medicine, making it a relatively closed domain. These features provide a good environment and structure for constructing a knowledge base. Among the several forms of knowledge representation commonly used, the semantic network is widely used for representing simple hierarchical structures. Because SNOMED has a hierarchical architecture, we adopted the semantic network for the knowledge representation method. 4.2.1 Metadata Set Definition Appendix A shows the metadata set definition used to parse the patient surgical pathology records. There are 25 terms that must be located and separated in the current patient record. These terms serve as attributes in the database table. Because some term names have changed through the years, several synonyms exist for some terms. For example, SURGICAL PATH NO, ACC#, and CYTOLOGY NO have the same meaning: the sequential number of the patient record in the set. The parser, a batch program, processes the patient record and creates an output file containing separate patient record fields. The Database (DB) loader reads the output generated by the parser then stores the results to the DB. The parser also generates an

PAGE 59

47 index file that has proximity information among the words inside the gross description and diagnosis. This can be used in multiple keyword information searches. The proximity information is needed to rank the relevant results. 4.2.2 Information Processing: Retrieval and Extraction There are several distinct advantages in processing the pathology patient data. First, the patient record data from 1982 to the present are unique to the University of Florida. This reflects a unique character, both regionally and periodically. Thus, when the parsing is finished, the analysis of the frequency of words and multiple word terms has significant meaning. Second, because the patient reports are expressed in standard medical language (which varies slightly from physician to physician), the terms used are sometimes not an exact match to the SNOMED terms. This makes it useful to analyze the patient reports based on the SNOMED terms. Patient reports also have a field that shows matching SNOMED codes with the . The analysis of the SNOMED code frequency throughout the patient records can give a valuable research sense to the pathologist. These types of analysis can be done statically and can be reported all at once. While this static analysis is extremely useful, most information processing should be done dynamically. We cannot imagine or anticipate all requests that might be made of this knowledge base. So for information retrieval purposes, the terms in the documents were analyzed. This provided the relation between the documents and the terms in the form of a proximity value. 4.3 Summary We showed the architecture of the developed knowledge based information retrieval model. The model shows well-separated sections of on-line and off-line

PAGE 60

48 calculation to provide efficiency in the calculations during the document retrieval process. The knowledge reduction technology enables the off-line adaptation of knowledge, which is a distinct modeling concept compared to other knowledge-base models incorporating knowledge processing in their retrieval process. We talked about the experimental domain: pathology and SNOMED. In the next chapter, we describe the details of this model. First, we describe the Knowledge Base Management Engine (KME) and a knowledge base structure that contains the domain specific knowledge in Chapter 5. Second, we provide details on the Knowledge Conversion Engine (KCE) in Chapter 6. There, we describe the query vector, the document vector, and the conceptual document vector. The VSM IR engine uses the same methods as the conventional vector models querya document relevancy calculation method.

PAGE 61

CHAPTER 5 KNOWLEDGE BASE MANAGEMENT ENGINE The knowledge base for this KBIR system is the Systematized Nomenclature of Medicine (SNOMED). In this chapter we discuss the SNOMED based knowledge model, which consists of pre-coordinated knowledge and post-coordinated knowledge. The pre-coordinated knowledge is knowledge described in SNOMED that is coded by a Pathologist. We can say that this knowledge is the expert knowledge that the Pathologist used in writing and understanding a patients report. The post-coordinated knowledge is a special form of knowledge that can be obtained from a patients report. This is augmentable knowledge that can be found from the introduction of new data. The knowledge base uses the constructed model in the information retrieval process. 5.1 Semantic Network Knowledge Base Model Representing SNOMED SNOMED is a detailed and specific coded vocabulary of names and descriptions used in healthcare. It is explicitly designed for use in computerized patient records. We can classify the term-to-term relationships, which are called the pre-coordinated relationships in SNOMED as one of three types. See Figure 5-1. Hierarchical Topology (has-a) Synonymy (is-a)Multi-axial relation Figure 5-1. The three types of SNOMED term relation 49

PAGE 62

50 The first type is a hierarchical topology. The SNOMED terms are all arranged in a hierarchy, represented by an alphanumeric code where each digit represents a specific location in the hierarchy. Figure 5-2 illustrates the hierarchical structure of this knowledge modeled as a semantic network. Arcs expressing the part of or has-a relation connect the nodes of this network. Moving from a lower level concept to a higher level is generalization, while moving in the opposite direction is specialization. T28000Lung T28100Right Lung T28500Left Lung T28110Right Lung,apex T28120Right Lung, base HierarchyGeneralizationSpecialization Figure 5-2. SNOMED hierarchical term relationship SNOMED has controlled vocabulary characteristics. A controlled vocabulary allows individuals to record data in a patients record using a variety of synonyms, where each references a primary concept. For example, in SNOMED, the following terms are classified as symptoms of increased body temperature: FEVER, PYREXIA, HYPOTHERMIA, and FEBRILE. Each carries the same term code. Figure 5-3 illustrates another example using the semantic network form. We call the relationship of synonyms an is-a relationship. The synonym relation is explicit each other. There is no propagation among the nodes.

PAGE 63

51 D0110Disease caused by bacteria D0110Bacterial infectiousdisease D0110Bacterialsepsis D0110Bacterial infection Synonymy Figure 5-3. SNOMED synonyms relationship D4094Favism E6921Fava bean F21440G-6PD Figure 5-4. SNOMED Multiaxial relationship The third relationship of SNOMED terms is a multi-axial relation shown in Figure 5-4, which refers to the ability of the ordered set of names to express the meaning of a concept across several axes. We can find examples of this relationship over all axes with it most apparent in the disease axis. The SNOMED D code representing Tuberculosis has an information link to the T code representing Lung. This relationship is pre-coded, mirroring the knowledge encoded at the time of SNOMEDs standardization.

PAGE 64

52 5.2 Classification of the Post-Coordinated Knowledge The domain-specific knowledge of our model handles only multi-axial relationships among the three types of SNOMED relations. This relationship is most apparent in the disease axis with a series of codes from other axes of SNOMED comprising the essential characteristics of a disease. As detailed in Section 4.1, SNOMED consists of six categories: Topography, Morphology, Etiology, Function, Disease, and Procedures. A patient report has terms showing matching SNOMED categories and numbers. It is possible to code most of the anatomic and physiologic elements of a disease process, both normal and abnormal, with the combination of the five axes. These elements are often used to summarize a codable class of disease or a recognized syndrome, basically what is called the SNOMED equation shown in Figure 4-3. Some of the relations are straightforward but often cases have unique relationships based on the patients report. It is possible to develop a unique knowledge base using these relationships. We can find statistics within the pathology document space that form the basis of the post-coordinated knowledge, then we classify the extracted post-coordinated knowledge. 5.2.1 Statistics of Pathology Patient Report Documents Space We examined Anatomic Pathology (AP) data sets from 1983 to 1994. There are a total of 290,346 data sets. Table 5-1 shows the number of data each year. From the data set, we extracted the SNOMED codes from each documents. The SNOMED codes represent the semantics of each document. Table 5-2 identifies the number of unique SNOMED axes. Appendix B is a partial list of unique SNOMED codes found in the patient reports.

PAGE 65

53 Table 5-1. Number of AP data each year from to Year Number of sets 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 17,351 23,186 22,781 22,928 22,965 26,663 27,486 27,814 25,497 23,755 24303 25635 Total 290,346 Table 5-2. Number of unique SNOMED axes equations Axis Number of unique occurrence Total occurrence T M E F D P 3,759 4,460 315 413 771 637 702,942 594,870 137,057 44,278 11,001 348,716 Total 10,355 1,838,864 Table5-3 is the number of distinct relations between axes. From the statistical data, we can calculate the base prior probability of the nave Bayes based post-coordinate knowledge structure that is explained in Section 5.3. 5.2.2 Classification of Post-Coordinated Knowledge From the SNOMED codes of each document, we can extract post-coordinated knowledge. Because of the uncertainty of the world, the pathologist does not know or describe the SNOMED equation exactly. This means there will be a partial description of knowledge. We only count the description of SNOMED code as post-coordinated knowledge if they contain the D axis. If the pathologist described SNOMED code

PAGE 66

54 including D, there is acceptable certainty of that a SNOMED equation exists. Figure 5-5 shows the four kinds of SNOMED equations found in documents space. Table 5-3. Relation statistics among axes Axis Number of unique relations Related axis Number of two-axis relations M 34354 E 979 F 972 D 1515 T 48170 P 10299 T 34354 E 1160 F 1268 D 1684 M 75190 P 12480 T 979 M 1160 F 57 D 75 E 2999 P 527 T 972 M 1268 E 57 D 190 F 3229 P 486 T 1515 M 1684 E 75 F 190 D 4706 P 1067 Table 5-4 shows the amount of post-coordinated knowledge found in the document space. We use this knowledge to induce possible diseases from incomplete SNOMED equations (i.e., equations lacking a disease axis).

PAGE 67

55 T D (a) Two axis relation T D E T D F T D M (b) Three axis relationships T D E F T D M E T D M F (c) Four axis relationships T D M E F (d) Five axis relationship Figure 5-5. Classification of post-coordinated knowledge Table 5-4. Statistics on post-coordinated knowledge Post-coordinated knowledge relations Number of unique relations D-T 568 D-T-E 26 D-T-F 38 D-T-M 7,425 D-T-E-F 3 D-T-M-E 305 D-T-M-F 534 D-T-M-E-F 68

PAGE 68

56 5.3 Statistical Model of the Post-Coordinated Knowledge Figure 5-6 shows an example of a four-axes-relation post-coordinated knowledge. We define link frequency (lf) as the total number of links in the code-to-code relation context after parsing the current patients report. The link frequency shows the closeness of the relationship, the larger the closer. TLungD: Tuberculosis Disease axis termlf=230lf=140lf=54lf=78lf=376lf=1480lf=378Other axis terms Type-M relationsType-D relations MGranuloma EM.tuberculosis FFever Figure 5-6. An example of a four-axis-relation post-coordinated knowledge We can obtain the post-coordinate knowledge by searching the complete SNOMED equation from documents described in the previous section. Then, we can obtain the link frequency of each relation between two axes statistically following the induction of complete knowledge from the incomplete SNOMED equation. We use the link frequency, discussed in Chapter 6, for conversion of statistical model of post-coordinated knowledge. 5.4 Nave Bayes Model of Post-Coordinated Knowledge It is possible to create or learn a Bayesian network from the data. This is an instance of the problem, known in the statistics literature as density estimation. We can estimate the probability density (that is, a joint probability distribution) from the data. When we learn a Bayes network from the data, there are four different cases: structure

PAGE 69

57 known or unknown, and all variables observable or some unobservable. For our case, the structure is known and some variables are unobservable. To model post-coordinated knowledge, we have several assumptions: We consider only the knowledge consisting of a SNOMED equation. Figure 5-7 shows the basic architecture of a SNOMED equation expressed using a Bayesian network. We assume we have complete knowledge before processing a patients report. The complete knowledge can be obtained from searching complete SNOMED equations from the documents space. We call this complete knowledge as a post-coordinated knowledge. The post-coordinated knowledge consists of combination of the five axes with the disease axis being mandatory. Complete knowledge is unique. Each disease is independent. The four axes (T, M, E, and F) are independent of each other. T, M, E, and F are conditionally dependent upon the instantiation of D. D T F E M Figure 5-7. Structure of the post-coordinated knowledge in a Bayesian network In our case, the structure of a Bayes network is fixed. It has one of the forms shown in Figure 5-5. We can consider the knowledge complete only if there is disease axis in the SNOMED equation (i.e., in the document). We use the following algorithm to extract the knowledge. 1. Look through the documents to find a SNOMED equation in the document having the complete post-coordinated knowledge form shown in Figure 5-5.

PAGE 70

58 2. Extract only the complete knowledge form from the documents retrieved. 3. Use an expert to verify that the extracted knowledge is correct. Generally, we can consider the equation to be complete if it contains a D axis. 4. Add the extracted and verified knowledge into the systems knowledge the Post-coordinated knowledge base (PCKB). It is possible that individual document can contain incomplete knowledge due to a lack of expert knowledge or an error. This means some variables of the Post-coordinated knowledge base (PCKB) are not observable in some documents. In that case, we must induce the value of the unobserved variables in the complete PCKB. To do this, we need to estimate the probability values of the PCKB structures variables. It is easier to start by estimating P(D). This is computed by counting how many times D is true (=found positive) in data set (documents) and dividing by n, the total numbers of documents. To obtain an estimate of the probability that T is true given that D is true, we just count the number of cases in which T and D are both true, and divide by the number of cases in which D is true. The probability of T given not D is similar shown below. ntrueDDP)(#)( )(1)(~DPDP )(#)(#)|(trueDtrueDtrueTDTP )(#)(#)|~(falseDfalseDtrueTDTP There is one problem with this approach. There will be situations where the number of D is true, D is false, or T and D are true cases is 0. In those situations we calculate a value of 0 for that probability. Because we start from the base knowledge

PAGE 71

59 structure, the later case should not occur, but it is possible for the number of D is true or D is false cases to be 0. To guard against this, we can apply a Bayesian correction to our estimates. This means, essentially, initializing our counts at 1 rather than at 0. So, we add a 1 to the count in the numerator, and a value m to the denominator, where m is the number of possible different values the variable whose probability we are estimating can have. In our case, all variables are binary, so we add a 2 in the denominator. The new formula looks like the following. 2)(#1)(#)|( trueDtrueDtrueTDTP 2)(#1)(#)|~( falseDfalseDtrueTDTP Processing documents to obtain PCKB results in m components of PCKB. Each PCKB has the probability estimations shown in Figure 5-8. 5.5 Summary We described the Knowledge-base Management Engine (KME) modeling SNOMED pre-coordinated and post-coordinated knowledge. The pre-coordinated knowledge is modeled using a semantic network notation. It has synonym, multi-axial, and hierarchical relationships. The post-coordinated knowledge can be modeled either statistically or probabilistically. We created the statistical model using the concept of link frequency that can be obtained from the processing of the documents space. We used the nave Bayes network as a probabilistic model of the post-coordinated knowledge. The nave Bayes network model has a simple structure by its independence assumption, while providing simplistic but acceptable results with its simple structure for calculating the joint probability distribution, that is post priority of disease.

PAGE 72

60 We describe the Knowledge Conversion Engine (KCE) in the next chapter. The KCE handles the conversion of knowledge to quantitative values. We call the conversion process knowledge reduction. d1 t1 f1 e1 m1 P(d1), P(~d1)P(t1|d1)P(t1|~d1)P(m1|d1)P(m1|~d1)P(e1|d1)P(e1|~d1)P(f1|d1)P(f1|~d1) Figure 5-8. PCKB component structure and probability estimation.

PAGE 73

CHAPTER 6 KNOWLEDGE CONVERSION ENGINE (KCE) The Knowledge Conversion Engine (KCE) converts the Support Vector Machine (SVM) document vector to a conceptual document vector reflecting the knowledge of the SNOMED Semantic Network Knowledge Base (SNN-KB). We start our discussion of the process with a description of the SVM document vector. 6.1 Support Vector Machine Document Vector The best-known model in information retrieval is the Vector Space Model (VSM) (Salton et al. 1989). In the VSM, documents and queries reside in vector space. In this space, each document can be represented as a linear combination of term vectors. The definition of the vector space model follows: Definition 6.1: A document vector for a document is jd jd= Tjtjjwww),...,,(,,2,1 where 0,jiw is a weight associated with the pair () where ik jd ik is an index term, is a document, and jd t is the number of index terms in the whole system. Definition 6.2: The set of all index terms K is K = {} where tkk,...,1 t is the number of index terms in the whole system. Normally the index terms are words contained in the document. The set is usually confined to only the significant words by eliminating common functional words called stopwords. The VSM uses the term frequency and the inverse term frequency as a weighting scheme associated with the document. Definition 6.3: The weight s 0,jiw i 61

PAGE 74

62 ijijiidftfw ,, where jitf, is the term frequency of term i in document j and iinNidflog (the inverse document frequency) where N is the number of documents in the collection and in is the document frequency of term i The document frequency is the number of documents in which the term occurs. 6.2 Conceptual Document Vector The SVM document vector uses term frequency and inverse document frequency as a conceptual imbuement to the information retrieval model. There has been an attempt to use phrases as index terms instead of words (Mao and Chu, 2002), which gives a conceptual similarity of phrasal words in the retrieval model. They reported a 16% increase of retrieval accuracy compared to the stem-based model. In the Vector Space Model, term vectors are pair-wise orthogonal meaning that terms are assumed to be independent. There was an attempt to incorporate term dependencies, which gives semantically rich retrieval results (Billhardt et al. 2004, p. 239). They used a term context vector to reflect the influence of terms in the conceptual description of other terms. The definition of a term context vector follows: Definition 6.4: The set of term context vectors T is nnnnnncccccccccT212221212111 where n is number of terms and ikc represents the influence of term on term t kt i Definition 6.5: The term context vector it is the ith column of matrix T where Tiniiiccct,,,21 and where n is number of terms and

PAGE 75

63 ikc represents the influence of term on term t kt i The Knowledge Conversion Engine (KCE) converts relationships within the SNN-KB into a term context vector. In the following, we discuss how the elements of matrix T can be obtained from domain-specific knowledge base representation. 6.3 KCE: Knowledge Reduction jd Tjtjjwww),...,,(,,2,1 Computer friendly statistical modelComputationally efficientHuman friendly graph representationComputationally complexReduction ofKnowledge Figure 6-1. Knowledge reductions There are two types of knowledge to convert: pre-coordinated and post-coordinated knowledge. We reduce the dimension of the knowledge of the pre-coordinated knowledge to a conceptual document vector. The form of knowledge expressed by a graph (in our case, a semantic network) is a human friendly form. But it is computationally complex. We convert that knowledge into a computer friendly and efficient statistical form. The concept of knowledge reduction is shown in Figure 6-1.

PAGE 76

64 6.4 KCE: Conversion of Pre-Coordinated Knowledge Three types of relationships exist within the SNN-KB model representing SNOMED. In the first type, the hierarchical topology relationship, each node has attributes denoting its characteristics on the hierarchical tree. i i j j L(i)=0D(i)=6L(j)=2D(j)=0 Figure 6-2. Attributes of the SNN-KB hierarchical topology relation L(i) is the level of term i in a knowledge tree. D(i) is the number of descendents of term node i in the tree. The term influence between i and j is inversely proportional to the distance, which is the difference of the levels. Having many descendents means that a node is a more general term than some node having a smaller number of descendents. So term influence is inversely proportional to the number of descendents. Thus we can calculate the SNN-KB hierarchical topology relationship between the two terms i and j as: Definition 6.6: from the SNN-KB hierarchical topology is ijc )()(1log),(1)(jDiDjidShtCcij where )(ShtC is the coefficient for the SNOMED hierarchical topology relation and )()(,jLiLjid where L(i) is level of node i and L(j) is level of node j, D(i) is number of descendents of node i, and

PAGE 77

65 D(j) is number of descendents of node j. For the synonym relations: Definition 6.7: from the SNN-KB synonym relation is ijc )(SsCcij where C(Ss) is the coefficient for the SNOMED synonym relationship. For the multi-axial relations: Definition 6.8: from the SNN-KB multi-axial relation is ijc )(SmCcij where C(Sm) is the coefficient for the SNOMED multi-axial relationship. The value of C(Sht), C(Ss), and C(Sm) should be optimized by simulation. 6.5 KCE: Generating the Conceptual Document Vector By converting the SNOMED knowledge and domain-specific knowledge to the term-relation matrix T defined in Definition 6.4, we can transform each initial document vector jd= Tjtjjwww),...,,(,,2,1 into a conceptual document vector Tjtjjiccccd),,,(,,2,1 using the equation in Definition 6.9 (Billhardt et al. 2004, p. 240). Definition 6.9: icd from id (Definition 6.1) and it (Definition 6.5) is njijnjjjijiwttwcd11 where it is the term context vector of term and jt jt is the length of vector jt The division of the elements in the term context vectors by the length of the vector is a normalization step.

PAGE 78

66 6.6 KCE: Conversion of the Post-Coordinated Knowledge Post-coordinated knowledge can be obtained from a users document (i.e., a patients report) after processing all documents in the system. This knowledge cannot be obtained from the pre-defined SNOMED knowledge base. This knowledge contains noise because the coding ability including the correctness of the coding of the patient report varies from pathologist to pathologist. We can define two kinds of models: statistical and probabilistic. 6.6.1 Statistical Model of Post-Coordinated Knowledge To compute the statistical model, we first introduce the link frequency (lf) to express the closeness of the relation between terms. Definition 6.10: The link frequency is lf the number of linkages accumulated from all system document domain-specific knowledge. The domain-specific knowledge in pathology consists of the multi-axial relations that have more importance on the disease axis. In the knowledge of multi-axial disease centered relationships, relations between axis terms can be divided into two types. We call relations including a disease as a D-type (Disease related type) relation and the other relations as an M-type (Multi-axial related type) relation. The reason for separating the relations is that the disease axis related relations have more meaning than the other relations. Figure 6.3 shows an example of this domain-specific knowledge model showing newly defined attributes. Figure 6.3 describes a relation between disease i and other axis terms: j1, j2, j3, and j4. The relation between i and j1 was found 230 times, which is the link frequency from the start of the system until now (i.e., since the start of data in the database). Because the

PAGE 79

67 value of link (i, j1) is greater than the other D-relation links, it is more important than the other links. j2 j2 j1 j1 i i j3 j3 j4 j4 Disease axis termlf=230lf=140lf=54lf=78lf=376lf=1480lf=378Other axis terms Type-M relationsType-D relations Figure 6-3. Example of Domain-Specific Knowledge relations For the Type-M relations, we can define the term to term relation factor, as: ijc Definition 6.11: from the Domain-specific Type-M relation is ijc 12)1()(nnlfDMClfcijCCCijij where C(DM) is the coefficient of the Domain-specific M-type relation, ijlf is the link frequency between i and j, and Clf is the link frequency of other relations other than i and j. Figure 6-4 shows the conversion concepts of type-M relations. The type-M relation is a sum of the importance of link and averaged influences from other links. If we look at type-D relations, one disease term has several relations with type-M nodes. So, we have to consider the influence on one type-D relation to the other type-D

PAGE 80

68 relations. For example, if we calculate the relation factor between node i and j1, we must consider the influences from other relations: (i, j2), (i, j3), and (i, j4) to the relation (i, j1). TLung lf=376lf=1480lf=378Other axis terms Type-M relations MGranuloma EM.tuberculosis FFever T M P F E cMT 12)1(nnlflfcMTcccMTMT Figure 6-4. Conversion of type-M relations For the D-type relations: Definition 6.12: from the Domain-specific Type-D relation, where node i is disease term and node j is other term, is ijc 1)()(iijCCijijNlfDDnClfDDCcw here C(DD) is the coefficient of the Domain-Specific D-type relation, ijlf is the link frequency between i and j, C(DDn) is the coefficient of the Domain-Specific Disease Neighbor relation, ijlf is the link frequency other than i and j, and N i is the number of axis other than diseases in the knowledge-base. The statistical model of the post-coordinated knowledge can be applied to the conceptual matrix. This means the knowledge is applied to the document vector generally regardless of each documents situation.

PAGE 81

69 6.6.2 Probabilistic Model of Post-Coordinated Knowledge We defined the nave Bayes network model of the post-coordinated knowledge in Section 5.3. After processing documents for post-coordinated knowledge (PCK), we have n documents and m PCKs. Each PCK has a specific form shown in Figure 5.6. The object of inference in the knowledge-based information retrieval model is to find a disease from the given findings (combinations of T, M, E, and F). Each document does not contain complete PCKs normally. Because of the lack of expert knowledge, it is impossible to write a complete form of the PCKs in a patients report. So we must estimate what kind of disease is most likely from the given findings in the document. This is the key to improving the knowledge enhancement of the retrieval process. We modeled the PCKs using nave Bayes in Section 5.3. We can define the posterior probability that we are attempting to calculate as: ),,,|(femtDP where D is the set of diseases that has a relationship with the given findings (t, m, e, and f) found by searching PCKs. The posterior probability can be solved by Bayes theorem. ),,,()|,,,()(),,,|(femtpDfemtPDPfemtDP In practice, we are only interested in the numerator of above fraction, since the denominator does not depend on D and the values of the t, m, e, and f that are given, so the denominator is constant. By the independence assumption, we can rewrite the fraction as: niiDFPDPZfemtDP1)|()(1),,,|(

PAGE 82

70 where F i is set of findings. The post-coordinated knowledge has specific relations with the individual documents. Actually, the individual knowledge is defined from the specific contents of each document, so we cannot use knowledge reduction in this case. Knowledge reduction handles general knowledge conversion cases. So we have to apply the post-coordinated knowledge to each document: more specifically to each individual document vector. We can classify several cases for conversion of post-coordinated knowledge. Refer to Figure 5-5 for the classification of post-coordinated knowledge. We use PCKB-a for a one axis relation, PCKB-b for a two axes relations, PCKB-c for a three axes relation, and PCKB-d for a four axes relationship. Case 1: The document contains all four axes, for example (t, m, e, and f). We must find the probability of d based upon the existence of (t,m,e,f). This is performed by searching PCKB-d. Searching PCKB-a, PCKB-b, or PCKB-c is not necessary because those have less information. We can obtain only one component of knowledge from PCKB because with the five axes of information, the knowledge is complete and unique. Case 2: The document contains three axes all except d. Figure 6-5 shows an example of this case. Here, we must compute the probability of each possible diseases, then another axiss, i.e., P(d1|t, m, e) (1) P(d2|t, m, e) (2) after finding the possible post-coordinate knowledge from PCKB-c and PCKB-d. Searching PCKB-d is required because PCKB-c can be inclusive knowledge of PCKB-d. d1 t f1 e m f2 d2 Figure 6-5. Examples of case2

PAGE 83

71 We know already P(d1), P(d2), P(t|d1), P(m|d1), P(e|d1), P(f1|d1), P(t|d2), P(m|d2), P(e|d2), and P(f2|d2). By the nave Bayes theorem, the posterior probability (1) and (2) can be calculated and compared by: )1|()1|()1|()1()1|()1(1),,|1(1dePdmPdtPdPdFPdPZemtdPnii )2|()2|()2|()2()2|()2(1),,|2(1dePdmPdtPdPdFPdPZemtdPnii Then, we can augment the document vector according to the relative normalized value of P(d1|t, m, e) and P(d2|t, m, e) with some coefficient. The complexity of this algorithm, is O(mn) where n is the number of documents and m is a count of the post-coordinate knowledge. Case 3 is the case when two axes relations found in document and case 4 is the case when one axis relation found in document. The calculation is as straightforward as in case 2. 6.6 SVM IR Engine: Document Retrieval After the process of converting the document vector to the conceptual document vector, the system can start accepting queries. A query is expressed identical to a document vector where the query terms are the vector elements. The query vector q is compared with the conceptual document vector icd using the cosine similarity measure. Definition 6.14: The similarity between q and icd is niniiiniiiiiicdqcdqcdqcdqcdq11221.),cos(

PAGE 84

72 The similarity measure produces a ranked list of relevant documents related to the query. 6.7 Summary We described the details of Knowledge-base Conversion Engine (KCE). The KCE reduces knowledge expressed by a semantic network or a Bayesian network into quantitative values to provide efficiency in the retrieval process. Conversion of the knowledge is called knowledge reduction because the reduction process reduces the graphical knowledge into a two-dimensional value representing the number of relations between the two terms. The conversion of Bayesian network knowledge is done by directly applying the inferred certainty value into a document vector. This process applies the knowledge into the individual documents, which is called personalized knowledge application, while the conversion of pre-coordinated knowledge is general application of knowledge. In the next chapter, we describe the result of performance evaluation of the developed the knowledge base information retrieval model.

PAGE 85

CHAPTER 7 PERFORMANCE EVALUATION 7.1 Simulation Parameters In our experiment, we used recall and precision metrics for evaluation of the performance as explained in Section 2.4. We can consider the gain of performance if the recall-precision graph shape goes to upper right direction shown in Figure 7-1, because in ideal case, the precision should be 100% when the recall is 100%. RecallPrecision 1001050060 PerformanceDecreasingPerformanceIncreasing Figure 7-1. Performance evaluation metrics To calculate precision and recall, we must know the exact relationship between each document and the query. An expert should determine this, so it is impossible to evaluate the relevancy between documents and a query if the set is big. In our case, the total number of documents is nearly one half million. We selected 2000 case documents signed by a top expert, because those documents should have a low error rate in describing post-coordinate knowledge. Then we selected 261 cases randomly among 2000 cases because we need to reduce the size of set to be able to examine relevancy by human expert. The selected 261 cases were examined for their relevancy with queries 73

PAGE 86

74 membranous nephropathy lupus and nephrotic syndrome. Our expert rated the relevancy between each document and the query as Positive, Neutral, and Negative. In this chapter, we call query membranous nephropathy lupus as Q1 and nephrotic syndrome as Q2. Table 7-1 shows the result of evaluation for the 261 documents. Table 7-1. Relevancy check result of 261 simulation documents Query # of positive # of neutral # of negative Total relevant (positive+neutral) Q1 24 95 142 119 Q2 23 90 148 113 7.2 Simulation Result 7.2.1 Performance Evaluation with Pre-Coordinated Knowledge Figure 7-2 shows the result of the query membranous nephropathy lupus on the positive cases. This graph shows some degradation of performance for the knowledge based information retrieval (KBIR) model compared with the support vector machine (SVM). We can think of the KBIR having the same effect as query expansion. The KBIR expands the document vector instead of the query vector. If the knowledge has synonyms, the KBIR expands the document vector to include synonyms of the query membranous nephropathy lupus. This causes an expansion to a somewhat broader range of knowledge. For example, membranous can be expanded to a more general term, so the degradation on the positive case may be caused by a general expansion of the knowledge of KBIR. This can be explained more by looking at the results of query 1 if we included the neutral cases in the performance evaluation as shown in Figure 7-3.

PAGE 87

75 0.0%10.0%20.0%30.0%40.0%50.0%60.0%70.0%80.0%90.0%100.0%0.0%20.0%40.0%60.0%80.0%100.0%RecallPrecision KBIR VSM Synset CrossRef SomeRel Figure 7-2. Comparison of performance for query1 on positive cases. 0.0%10.0%20.0%30.0%40.0%50.0%60.0%70.0%80.0%90.0%100.0%0.0%20.0%40.0%60.0%80.0%100.0%RecallPrecision KBIR VSM Synset CrossRef SomeRel Syn+CrossRef Figure 7-3. Evaluation results of query 1 including the neutral cases.

PAGE 88

76 70.0%75.0%80.0%85.0%90.0%95.0%100.0%0.0%20.0%40.0%60.0%80.0%100.0%RecallPrecision KBIR VSM Synonym CrossRef SomeRel Figure 7-4. Evaluation results for query 2 for the positive cases 0.0%10.0%20.0%30.0%40.0%50.0%60.0%70.0%80.0%90.0%100.0%0.0%20.0%40.0%60.0%80.0%100.0%RecallPrecision KBIR VSM Synonym CrossRef SomeRel Figure 7-5. Evaluation results for query 2 including the neutral cases

PAGE 89

77 These results show big gain in performance when compared with the degradation that occurs with only the positive cases. If we look at the result more generally, meaning there is an importance to the neutral cases, the performance evaluation result shows promising result. The gain can be explained by the expansion of knowledge in the document vector. If we look at the result of VSM, the resulting documents only have to contain one of the query terms: membranous, nephropathy, or lupus. But KBIR retrieves some documents that do not contain any query words because the document vector was extended to contain terms related to the existing terms in these documents. This increases the recall rate. If we look at precision, this starts to make sense when we consider the results more generally. Figure 7-4 is the result of query 2, nephrotic syndrome on just the positive cases. When this is contrasted with the evaluation of query 1 on positive cases, the results show a performance gain. This can be explained by the characteristics of KBIRs knowledge management. Because the number of terms in query 2 is smaller than in query 1, the amount of expanded knowledge for query 2 is less than for query 1. This means that knowledge expansion for queries having fewer query terms tends to have smaller error rates compared to queries having many terms. If we look at the performance evaluation results of query 2 including the neutral cases shown in Figure 7-5, they show a lower performance gain when compared to the results of query 1. This can be explained also by the small expansion of knowledge caused by lower number of terms in the query.

PAGE 90

78 If we look at the effects of each relationship on KBIR performance, we can say the result of KBIR performance is the sum of each relation: synonym, cross reference, and some relation. Normally, synonym relations do not show a significant contribution but cross reference relations (i.e., relations between SNOMED axes) show a significant contribution in performance. This can be explained as each documents concept can be expressed by a SNOMED equation, so the relationship between concepts is more important than just the synonym relations between terms. Table 7-2 shows quantitative values of performance gain for the pre-coordinated knowledge addition compared to the VSM method. Table 7-2. Value of performance gain of pre-coordinated knowledge compared to VSM Query Performance gain (%) Query 1 39.6 Query 2 20.6 Average 30.1 7.2.2 Performance Evaluation with Nave Bayes Post-Coordinated Knowledge Figure 7-6 shows the performance gain when we use the nave Bayes post-coordinated knowledge for query1 and Figure 7-7 for query 2. Table 7-3 shows the quantitative value of performance gain compared to VSM and pre-coordinated knowledge. Table 7-3. Value of performance gain of post-coordinated knowledge Query Performance gain (%) Compared to pre-coordinated knowledge Performance gain (%) Compared to VSM Query 1 7.0 47.0 Query 2 8.2 28.8 Average 7.6 37.9 The results show nearly the same percentage of improvement compared to the pre-coordinated knowledge case and different gain compared to the VSM case.

PAGE 91

79 0.0%10.0%20.0%30.0%40.0%50.0%60.0%70.0%80.0%90.0%100.0%0.0%20.0%40.0%60.0%80.0%100.0%RecallPrecision With PK Without PK VSM Figure 7-6. Evaluation results of query 1 including post-coordinated knowledge 0.0%10.0%20.0%30.0%40.0%50.0%60.0%70.0%80.0%90.0%100.0%0.0%20.0%40.0%60.0%80.0%100.0%RecallPrecision With PK Without PK VSM Figure 7-7. Evaluation results of query 2 including post-coordinated knowledge

PAGE 92

80 The reason is straightforward for the effects of knowledge application of our model explained in previous section. 7.2.3 Performance of Statistical Post-Coordinate Knowledge Model There is no significant performance improvement on this model as seen on Figure 7-8. We thought the statistical model of post-coordinated knowledge is general knowledge that can be applicable to all documents regardless of its own semantics of each document. The result shows the assumption is incorrect. 0.0%10.0%20.0%30.0%40.0%50.0%60.0%70.0%80.0%90.0%100.0%0.0%20.0%40.0%60.0%80.0%100.0%RecallPrecision With PK Without PK SVM Figure 7-8. Evaluation results of query 1 including statistical post-coordinated knowledge 7.3 Summary We showed the results of performance evaluation for our knowledge-based information retrieval model showing the effects of each pre-coordinated knowledge and post-coordinated knowledge.

PAGE 93

81 The results show a nearly 30% increase for pre-coordinated knowledge application and 37% increase for post-coordinated knowledge application compared to VSM. These increases occur even though the real-time speed of processing is comparable to VSM. We applied the statistical model of post-coordinated knowledge to all documents evenly by inserting computed relations into the term-context matrix. We assumed the statistical post-coordinated knowledge is general knowledge that can be applied evenly. But from the simulation results of the statistical model, we can conclude that the post-coordinated knowledge is personalized knowledge that should be applied to each document separately. We applied the nave Bayes model based knowledge to each documents term vector separately. The next chapter concludes our research summarizing contributions and identifying future work.

PAGE 94

CHAPTER 8 CONCLUSION In this dissertation, we have shown significant progress towards developing an information retrieval model augmented by a knowledge base. We created a knowledge based information retrieval (KBIR) model showing meaningful performance gain while providing same speed performance in the retrieval process. We summarize our contributions in Section 8.1 and discuss directions for future work in Section 8.2. 8.1 Contributions The objective of this dissertation was to design an intelligent information retrieval model producing knowledge infused answers to users by incorporating a domain-specific ontology in the knowledge-base using a computationally efficient knowledge conversion method. The main contributions of the dissertation to information retrieval research are as follows: Knowledge reduction to statistical model: The developed information retrieval model is a knowledge-based information retrieval model. Unlike the other models, which perform an ontology level information retrieval task such as an ontology comparison and an ontological query expansion, the proposed model reduces the knowledge level represented by the knowledge base to a statistical model such as the vector space models document vector shown in Figure 8-1. We used semantic networks for pre-defined knowledge and nave Bayes networks for post-coordinated knowledge. Those graphical knowledge representations are human friendly and easily understandable to human but computationally complex. The reduced statistical form of knowledge, such as a 82

PAGE 95

83 conceptual document vector, is not human friendly but is computer friendly and computationally efficient. DocumentVector DocumentVector KnowledgeConversion Engine ConceptualDocumentVector ConceptualDocumentVector KnowledgeBase Figure 8-1. Knowledge reduction to statistical model Query QueryVector VSM IR engine RankedResult Documents Black Box For Applying KnowledgeOFF-LINECALCULATION Figure 8-2. Off-line application of knowledge Off-line application of knowledge: Using knowledge reduction enables the off-line processing of the application (calculation) of knowledge to the information retrieval procedure shown in Figure 8-2. Only the conceptual document vector, which can be

PAGE 96

84 obtained from the document vector and the knowledge base, is involved in the on-line process of producing ranked results by comparing a users query and the documents. Inverse query expansion: The result of our knowledge-based information retrieval model is very similar to that of query expansion or latent semantic. Unlike those models, which calculate part of the knowledge during the retrieving process, our model does its processing offline, giving the same effect with a lower computational burden. Applicability to general open domain: Even if the proposed model uses domain-specific knowledge, this model can be used in an open-domain application if some types of knowledge bases are supported. One possible candidate for the open domain knowledge base is WordNet, which has a thesaurus and relations from the natural language domain. Flexibility on the knowledge representation: We defined some examples of knowledge reduction methods using a semantic network. The semantic network is an example of a knowledge representation, which is one of artificial intelligences field handling ontologies. Our model has flexibility on the type of knowledge representation if we can define the knowledge reduction scheme of the selected knowledge representation model. In our model, we used a nave Bayes network for representing post-coordinated knowledge. It has classification ability with less computational complexity and a reasonable approximation of conditional independence. 8.2 Future Work One task that needs completing is the modeling of the hierarchical knowledge. To adequately model the hierarchy in the Pathology domain requires that we refine the hierarchical relationship by looking at the SNOMED book. The reason is that the database storing the SNOMED notations is incomplete in exactly defining the

PAGE 97

85 hierarchical relationships. We need to make complete sets of the hierarchy of over 50,000 semantic relations existing in the SNOMED book to apply the hierarchy in our knowledge-base IR model. There is a possibility to use the current version of SNOMED, SNOMED-CT, that provides a more profound and accurate set of relationships in the pathology domain. This should be handled as a separate project because of the size and depth of the work. We can induce the result when we add the hierarchical knowledge in our model by looking at the results of other relation additions. The trends of relation additions show a higher degradation of performance if the relations are more general. We think that the hierarchical relationships will add a larger number of relations to the term matrix than the other relations, resulting in some degradation on the precision, but with a gain in recall. A second extension of this work is to apply our model to the open domain information retrieval process. Using WordNet as a knowledge source, we can see if there is a performance gain in general domain information retrieval. Extracting knowledge automatically from given documents to use as a knowledge source for the information retrieval process is a possible approach towards applying our model to the general open domain. Finally, we used the nave Bayes network for modeling post-coordinated knowledge. The nave Bayes model assumes independence among findings. Several Bayesian network based models exist providing dependence model among findings. Even though several papers identify that the nave Bayes model shows acceptable performance in its simple form, it would be worthwhile to compare the performance between the nave Bayes and other models providing the dependency relations between findings.

PAGE 98

APPENDIX A PRIMARY TERMS WHICH ARE THE BASIS FOR THE DB ATTRIBUTE Table A-1. Primary terms for APDB Terms Roles Etc SURG PATH NO SURGICAL PATHOLOGY NO# ACC. # ACC# CYTOLOGY NO Format:NNNN-YY-T NNNN: Serial number distinct in one year, digit width may vary YY: year expressed in two digit T: Type = { C, S, O, G, M } Type C Consultation Rpt S in-house surgical Rpt This number also shown at the end of the line having format: YYTNNNNN###YYMMDD NAME Patient name Format: Last, First, Middle, Suffix TEST NO Test number SPECIMEN NO SPECIMEN Specimen number MED REC NO Medical Record # 6 digit unique number of each hospital format: NN-NN-NN may vary ROOM WARD Room number WARD Patient location AGE Age of patient Format: NN [Y|M|D] NN number Y represent year M month D day SEX Sex of patient Format: {M|F} DATE Service Date Service date Format: Month Day, Year Example: JANUARY 07, 1981 PHYS PHYSICIAN Referring Physician Surgeon Referring Physician or Surgeon REPORT TYPE Example: S1 Surgical SERVICE Date obtained Date received Date Obtained Date obtained Date Received Date received 86

PAGE 99

87 Table A-1 Continued Terms Roles Etc HISTORY CLINICAL HISTORY Clinical history Specimen(s) submitted/ Procedures ordered Long text Specimen submitted GROSS DISCRIPTION MICROSCOPIC DESCRIPTION MICROSCOPTIC DESCRIPTION Light Microscopy Immunofluorescence microscopy Electron microscopy Other tests: e.g. included cytogenetics, molecular biology, or flow cytometry data DIAGNOSIS Bone marrow, aspiration: No lymphoma detected COMMENT PATHOLOGIST RETRIEVAL CODES Diagnostic/Retrieval codes Modifier codes Transaction codes: JP/whd Date of transcription: 03/23/99 Electronic signatures Date Electronically signed out

PAGE 100

APPENDIX B SNOMED STATISTICS Table B-1. Partial list of T code Name Number P(code/total disease) P(code/documents) T8X330 142850 0.203217335 0.491999201 T8X310 64010 0.091060144 0.220461105 T00XX0 53701 0.076394639 0.184955191 T83000 33989 0.048352496 0.117063779 T8X210 22408 0.031877452 0.077176886 T0X000 16728 0.023797127 0.057614019 T83300 14706 0.020920645 0.050649914 T82 14232 0.020246336 0.049017379 T83320 13621 0.019377132 0.046912993 T74000 13125 0.018671526 0.045204687 T84000 12585 0.017903326 0.043344837 T8X 11307 0.016085253 0.038943192 T86800 8341 0.011865844 0.028727794 T2Y030 7327 0.010423335 0.025235409 T88100 7299 0.010383502 0.025138972 T06000 6825 0.009709194 0.023506437 T01000 6449 0.009174299 0.022211431 T8X320 5778 0.008219739 0.019900395 T7X100 5648 0.008034802 0.019452653 T56000 5495 0.007817146 0.018925696 T2Y414 5407 0.007691958 0.018622609 T67000 4980 0.007084511 0.017151950 T71000 4853 0.006903841 0.016714541 T77100 4597 0.006539658 0.015832834 T63000 4457 0.006340495 0.015350651 T32010 4185 0.005953550 0.014413837 T80100 4137 0.005885265 0.014248517 T6X940 3679 0.005233718 0.012671089 T86120 3609 0.005134136 0.012429997 T04030 3543 0.005040245 0.012202682 T86110 3541 0.005037400 0.012195794 T04020 3523 0.005011793 0.012133799 T57000 3514 0.004998990 0.012102801 T0X00 3301 0.004695978 0.011369194 T08000 3207 0.004562254 0.011045442 T82900 2875 0.004089953 0.009901979 T81000 2863 0.004072882 0.009860649 T88960 2746 0.003906439 0.009457682 T66000 2726 0.003877987 0.009388798 88

PAGE 101

89 Table B-2. Partial list of M code Mcode Number P(M/Total M) P(M/documents) M09460 92173 0.154946459 0.317459169 M40000 59780 0.100492545 0.205892280 M00100 37701 0.063376872 0.129848526 M73320 29314 0.049277994 0.100962300 M09010 15825 0.026602451 0.054503937 M43000 14424 0.024247315 0.049678659 M69780 12485 0.020987779 0.043000420 M42100 12069 0.020288466 0.041567647 M4000 9195 0.015457159 0.031669112 M72020 9132 0.015351253 0.031452130 M09450 7753 0.013033100 0.026702624 M74006 7435 0.012498529 0.025607379 M0946 7399 0.012438012 0.025483389 M76720 7300 0.012271589 0.025142416 M81403 7150 0.012019433 0.024625791 M80703 6495 0.010918352 0.022369862 M49000 5736 0.009642443 0.019755740 M41000 5717 0.009610503 0.019690301 M72000 5611 0.009432313 0.019325219 M7332 5211 0.008759897 0.017947552 M72600 4652 0.007820196 0.016022263 M73225 4363 0.007334376 0.015026899 M73220 4317 0.007257048 0.014868467 M54000 4259 0.007159547 0.014668706 M69000 4141 0.006961185 0.014262294 M55600 4134 0.006949418 0.014238185 M74007 4126 0.006935969 0.014210631 M72120 3864 0.006495537 0.013308260 M69700 3668 0.006166053 0.012633203 M74030 3613 0.006073596 0.012443774 M69890 3428 0.005762604 0.011806603 M09030 3402 0.005718897 0.011717055 M74008 3379 0.005680233 0.011637839 M08900 3088 0.005191050 0.010635587 M38000 2842 0.004777514 0.009788322 M69705 2827 0.004752299 0.009736659 M6978 2526 0.004246306 0.008699965 M09000 2486 0.004179064 0.008562198 M79310 2449 0.004116866 0.008434764 M79320 2352 0.003953805 0.008100680 M45020 2292 0.003852943 0.007894030 M80702 2165 0.003639451 0.007456621 M55401 2102 0.003533545 0.007239638 M33400 2013 0.003383933 0.006933107 M31600 1944 0.003267941 0.006695460 M50000 1940 0.003261217 0.006681683 M28000 1938 0.003257855 0.006674795 M72200 1801 0.003027552 0.006202944

PAGE 102

90 Table B-3. Partial list of E code E code Number P(E/ total Es) P(E/ documents) E1851 31930 0.232968765 0.109972240 E1000 27453 0.200303523 0.094552706 E4080 11679 0.085212722 0.040224422 E100 9176 0.066950247 0.031603673 E185 8741 0.063776385 0.030105460 E8508 8570 0.062528729 0.029516508 E4432 7443 0.054305873 0.025634932 E8500 4736 0.034554966 0.016311573 E3345 4336 0.031636472 0.014933906 E850 4158 0.030337743 0.014320845 E443 4150 0.030279373 0.014293291 E9320 3173 0.023150952 0.010928341 E408 2790 0.020356494 0.009609225 E4433 2107 0.015373166 0.007256859 E9038 911 0.006646870 0.003137636 E1006 689 0.005027106 0.002373031 E932 664 0.004844700 0.002286927 E334 596 0.004348556 0.002052723 E1370 321 0.002342091 0.001105577 E4000 249 0.001816762 0.000857597 E1070 229 0.001670838 0.000788714 E93201 210 0.001532209 0.000723275 E3213 175 0.001276841 0.000602729 E4331 144 0.001050658 0.000495960 E1360 131 0.000955807 0.000451186 E3211 128 0.000933918 0.000440853 E5911 118 0.000860956 0.000406412 E903 118 0.000860956 0.000406412 E4061 113 0.000824474 0.000389191 E2800 95 0.000693142 0.000327196 E3223 72 0.000525329 0.000247980 E107 67 0.000488848 0.000230759 E3000 61 0.000445070 0.000210094 E9012 60 0.000437774 0.000206650 E280 59 0.000430478 0.000203206 E33451 55 0.000401293 0.000189429 E0001 54 0.000393997 0.000185985 E9068 49 0.000357515 0.000168764

PAGE 103

91 Table B-4. Partial list of F code F codes Number P(F/total F) P(F/documents) F31000 26518 0.598897873 0.091332410 F31170 4033 0.091083608 0.013890324 F03430 2392 0.054022314 0.008238447 F31680 1400 0.031618411 0.004821833 F30410 1096 0.024752699 0.003774807 F44100 1008 0.022765256 0.003471720 FXXXXX 878 0.019829261 0.003023978 F3168 842 0.019016216 0.002899988 F0343 527 0.011902073 0.001815076 F47610 485 0.010953521 0.001670421 F44110 430 0.009711369 0.001480992 F47680 430 0.009711369 0.001480992 F47160 423 0.009553277 0.001456882 F3041 271 0.006120421 0.000933369 F30060 241 0.005442884 0.000830044 F3117 218 0.004923438 0.000750828 FXXXX 214 0.004833100 0.000737052 F47510 208 0.004697592 0.000716387 F10363 166 0.003749040 0.000571732 F47120 161 0.003636117 0.000554511 F31100 158 0.003568364 0.000544178 F3006 151 0.003410271 0.000520069 F3100 115 0.002597227 0.000396079 F44200 100 0.002258458 0.000344417 F30710 96 0.002168120 0.000330640 F44150 66 0.001490582 0.000227315 F97010 66 0.001490582 0.000227315 F53812 53 0.001196983 0.000182541 F31600 43 0.000971137 0.000148099 F44220 42 0.000948552 0.000144655 F02120 40 0.000903383 0.000137767 F44170 29 0.000654953 9.98808E-05 F08240 26 0.000587199 8.95483E-05 F61340 26 0.000587199 8.95483E-05 F44040 25 0.000564614 8.61042E-05 F82600 25 0.000564614 8.61042E-05 F01050 24 0.000542030 8.26600E-05 F12206 24 0.000542030 8.26600E-05

PAGE 104

92 Table B-5. Partial list of D code D code Number P(D/total D) P(D/documents) D0445 4212 0.382874284 0.014506830 D6255 328 0.029815471 0.001129687 D3872 269 0.024452323 0.000926481 D4690 202 0.018361967 0.000695722 D6216 201 0.018271066 0.000692277 D8004 195 0.017725661 0.000671612 D6214 178 0.016180347 0.000613062 D67020 176 0.015998546 0.000606173 D6259 165 0.014998636 0.000568287 D3071 154 0.013998727 0.000530402 D05204 150 0.013635124 0.000516625 D5204 144 0.013089719 0.000495960 D3150 141 0.012817017 0.000485627 D3867 130 0.011817108 0.000447742 D044 117 0.010635397 0.000402967 D6505 108 0.009817289 0.000371970 D0828 107 0.009726389 0.000368526 D67022 80 0.007272066 0.000275533 D2381 79 0.007181165 0.000272089 D7100 66 0.005999455 0.000227315 D6507 65 0.005908554 0.000223871 D6730 65 0.005908554 0.000223871 D0110 64 0.005817653 0.000220427 D0557 61 0.005544950 0.000210094 D6702 58 0.005272248 0.000199762 D6218 48 0.004363240 0.000165320 D3890 46 0.004181438 0.000158432 D0188 44 0.003999636 0.000151543 D0550 42 0.003817835 0.000144655 D67021 41 0.003726934 0.000141211 D0316 40 0.003636033 0.000137767 D8081 40 0.003636033 0.000137767 D71000 39 0.003545132 0.000134322 D0378 38 0.003454231 0.000130878 D5205 36 0.003272430 0.000123990 D8091 35 0.003181529 0.000120546 D8543 35 0.003181529 0.000120546 D4587 34 0.003090628 0.000117102 D6217 34 0.003090628 0.000117102

PAGE 105

93 Table B-6. Partial list of P code P code Number P(P/P total) P(P/documents) P1140 79609 0.228291790 0.274186660 P1100 66636 0.191089597 0.229505487 P110 40732 0.116805653 0.140287795 P114 34128 0.097867606 0.117542518 P1154 12804 0.036717558 0.044099109 P1143 11259 0.032287019 0.038777872 P1104 9933 0.028484497 0.034210907 P31002 7921 0.022714759 0.027281244 P115 6950 0.019930258 0.023936958 P1251 6455 0.018510765 0.022232095 P176 6131 0.017581642 0.021116186 P1760 5136 0.014728318 0.017689240 P1141 4663 0.013371913 0.016060149 P1107 4279 0.012270730 0.014737589 P1342 4040 0.011585359 0.013914433 P2031 3802 0.010902855 0.013094721 P3100 3374 0.009675495 0.011620618 P5110 3257 0.009339979 0.011217651 PX400 2962 0.008494018 0.010201622 P1101 2898 0.008310488 0.009981195 P3050 2756 0.007903279 0.009492123 P3086 2165 0.006208491 0.007456621 P1341 2138 0.006131064 0.007363628 P1146 1967 0.005640693 0.006774676 P1340 1886 0.005408413 0.006495698 P1148 1271 0.003644800 0.004377536 P1130 1114 0.003194577 0.003836802 P1160 837 0.002400234 0.002882767 P1110 822 0.002357219 0.002831105 PX40 815 0.002337145 0.002806996 P2032 768 0.002202365 0.002645120 P308 754 0.002162218 0.002596902 P3250 733 0.002101997 0.002524574 P0000 672 0.001927070 0.002314480 P142 568 0.001628833 0.001956287 P134 514 0.001473979 0.001770302 P1147 484 0.001387949 0.001666977 P1470 470 0.001347802 0.001618758 P112 438 0.001256036 0.001508545 P1120 408 0.001170007 0.001405220 367 0.001052432 0.001264009 P3120 361 0.001035226 0.001243344 P1102 328 0.000940593 0.001129687 P1155 294 0.000843093 0.001012585 P2021 291 0.000834490 0.001002252 P1144 277 0.000794343 0.000954034 P1600 252 0.000722651 0.000867930 P3130

PAGE 106

LIST OF REFERENCES Barry, S., & Fellbaum, C. (2004). Medical Wordnet: A New Methodology for the Construction and Validation of Information Resources for Consumer Health. In Proceedings of Coling: The 20 th International Conference on Computational Linguistics, pp. 371-382. Berners-Lee, T., Hendler, J., & Lassila, O. (2001).The Semantic Web. Scientific American, 35-43. Billhardt, H., Borrajo, D., & Maojo, V. (2002). A Context Vector Model for Information Retrieval. Journal of the American Society for Information Science and Technology, 53(3), 236-249. Bono, E. D. (1973). Lateral Thinking: Creativity Step by Step. Perennial library, New York. Callan, J. P. (1994). Passage-level evidence in document retrieval. In Proceedings of the Seventeenth annual international ACM SIGIR conference on Research and Development in Information Retrieval, pp. 302-310, Chabat, F., Hansell, D. M., & Yang, G. Z. (2000). Computerized Decision Support in Medical Imaging. IEEE Eng. In Medicine and Biology, 19(5), 89-96. Chai, J. Y., & Biermann, A. (1997). The use of lexical semantics in information extraction. In Proceedings of the Workshop in Automatic Information Extraction and Building of Lexical Semantic Resources, pp.61-70. Cheeseman, P., & Stutz, J. (1996). Advances in Knowledge Discovery and Data Mining Bayesian classification (AutoClass): Theory and results. AAAI Press/MIT Press, p. 61. Menlo Park, CA. Crouch, C. J., & Yang, B. (2000). Experiments in automatic statistical thesaurus contruction. In Proceedings of the 15th annual international ACM SIGIR conference on Research and Development in Information Retrieval, pp. 77-88. Dekang, L.,& Pantel, P. (2002). Concept Discovery from Text. In Proceedings of Coling-02: The 18 th International Conference on Computational Linguistics, pp. 577-583. Detmer, W. M., & Edward, H. S. (1997). Using the Internet to Improve Knowledge Diffusion in Medicine. Comm. Of ACM, 40(8), 101-108. 94

PAGE 107

95 Diez, F. J., Mira, J., Iturralde, E., & Zubillage, S. (1997). DIAVAL, a Bayesian expert system for echocardiography. Artificial Intelligence in Medicine, 10, pp. 59-73. Domingos, P., & Pazzani, M., (1997). On the optimality of the simple Bayesian classifier under zero-one loss. Machine Learning, 29(2-3), 103-130. Draper, B. A., Collins, R. T., Brolio, J., Hanson, A. R., & Riseman, E. M. (1988). Issues in the development of a blackbone-based Schema system for image understanding, Blackboard systems, Reading, MA: Addison-Wesley. Domingos, P., & Pazzani, M., (1997). On the optimality of the simple Bayesian classifier under zero-one loss. Machine Learning, 29, 103-130. Foskett, D. J. (1997). Thesaurus. Readings in Information Retrieval, K. S. Jones, P. Willet, M. Kaufmann Publishers, San Fransisco Friedman, N., Geiger, D., & Goldsmidt, M., (1997). Bayesian network classifiers. Machine Learning, 29(2), 131-163. Fung, R., & Favero, B. D. (1995). Applying Bayesian Networks to Information Retrieval. Communication of the ACM, 38(3), 42-48. Furnas, G. W., Deerwester, S., Dumais, S. T., Landauer, T. K., Harshman, R. A., Streeter, L. A., & Lochbaum, K. E. (1998). Information retrieval using a singular value decomposition model of latent semantic structure, In Proc. of the 11th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 465-480. Gammerman, A., & Thatcher, A. R., (1991). Bayesian diagnostic probabilistic without assuming independence of symptoms. Methods of Information in Medicine, 30(1), 15-22. Gonzalez, A. J., & Dankel, D. D. (1993). The Engineering of Knowledge-based Systems: Theory and Practice, Prentice Hall, Englewood Cliffs, New Jersey. Gorry, G. A., & Barnett, G. O. (1968). Experience with a model of sequencial diagnosis. Comput Biomed Res, 1, 490-507. Gruber, T. R. (1993). A translation approach to portable ontologies. Knowledge Acquisition, 5(2), 199-200. Heckerman, D. E., & Nathwani, B. N. (1992). Toward normative expert systems: Part II. Probability-based representations for efficient knowledge acquisition and inference. Methods Inform Med, 31, 106-116. Inza, I., Larranaga, P., Etxeberria, R., & Sierra, B. (2000). Feature subset selection by Bayesian network-based optimization. Artificial Intelligence, 123(1-2), 157-184.

PAGE 108

96 Kim, S. B., Seo, S. C., & Rim, H. C. (2004). Information Retrieval using Word Senses: Root Sense Tagging Approach. SIGIR, pp.258-265. Kohavi, R., & John, G. (1997). Wrappers for feature subset selection. Artificial Intelligence, 97(1-2), 273-324. Kononenko, I. (1990). Comparasion of inductive and nave Bayesian learning approaches to automatic knowledge acquisition, Current Trends in Knowledge Acquisition, pp. 190-197, IOS Press, Amsterdam. Kononenko, I. (1991). Semi-nave Bayesian classifiers. In proceedings of 6 th European Working Session on Learning, Porto, Portugal, pp.206-219. Langley, P., & Sage, S. (1994). Induction of selective Bayesian classifiers. In proceedings of 10 th conference of Uncertainty in Artificial Intelligence, Seattle, WA, pp.399-406. Leacock, C., Chodorow, M., & Miller, G. A. (1998). Using corpus statistics and WordNet relations for sense identification. Computational Linguistics, 24(1), 147-165. Lee, M. C., Haug, P. J., & Fiszman, M. (2002) MPLUS: A Probabilistic Medical Language Understanding System. ACL, pp.29-36. Liu, H., & Motoda, H. (1998). Feature Selection. Knowledge Discovery and Data Mining, Kluwer, Boston. Mao, W., Chu, W. W. (2002). Free-text Medical Document Retrieval Via Phrase-based Vector Space Model. AMIA Annual Symposium. pp.489-493. Maron, M. E,. & Kuhns, J. L. (1960). On relevance, probabilistic indexing and information retrieval. Journal of the ACM, 7(3), 216-244. Miller, G. A. (1990). Wordnet: An on-line lexical database. International journal of Lexicography, 3(4). 235-312. Minsky, M. (1961). Steps toward artificial intelligence. In proceedings of Institute of Radio Engineers, 49, pp.8-30. Miquelez, T., Bengoetxea, E., & Larranaga, P. (2004). Evolutionary Computation based on Bayesian Classifiers. Int. J. Appl. Math. Comput. Sci., 14(3). 335-349. Mitra, M., Singhal, A., & Buckley, C. (1998). Improving automatic query expansion: In Proceedings of the 21st annual international ACM SIGIR conference on Research and Development in Information Retrieval, pp. 206-214. Monti, S., & Cooper, G. F. (1998). The impact of modeling the dependencies among patient findings on classification accuracy and calibration. In proc on AMIA Symposium, pp. 592-596.

PAGE 109

97 Newell, A, Shaw, J, & Simon, H. (1959). Report on a general problem solving program. In Proc on Conf Information Processing, pp. 256-264. Ohmann, C., Yang, Q., Kunneke, M., Stolzing, H., Thon, K., & Lorenz, W. (1988). Bayes theorem and conditional dependence of symptoms: Different models applied to data of upper gastrointestinal bleeding. Methods of Information in Medicine, 27(2), 73-83. Pazzani, M. (1997). Searching for dependencies in Bayesian classifiers. Learning from Data: Artificial Intelligence and Statistics V, pp.239-248, Springer, New York. Pfitzner, D., Hobbs, V., & Powers, D. (2003). A Unified Taxonomic Framework for Information Visualization. In Proceedings of the Australian symposium on Information visualization, 24, pp. 57-66. Porter, M. (1980). An algorithm for suffix stripping, Program, 14(3), 130-137. Rila, M., Takenobu, T. & Hozumi, T. (1998). The Use of WordNet in Information Retrieval. COLING-ACL, pp.31-37. Robertson, S. ,E. & Sparck, J. K. (1976). Relevance weighting of search terms. Journal of the American Society for Information Science, 27(3), 129-146. Rodger, A. C. (1979). SNOMED II: Systematized Nomenclature of Medicine, College of American Pathologist, Skokie, IL. Salton, G. & Lesk, M. E. (1968). Computer evaluation of indexing and text processing. Journal of the ACM, 15(1), 8-36. Salton, G., Yang, C. S., & Yu, C. T. (1975). A theory of term importance in automatic text analysis. Journal of the American Society for Information Sciences, 26(1), 33-44. Sanderson, M. (2000). Retrieving with good sense. Information Retrieval, 2(1), 49-69. Schweitzer, Y. L., Collet, P., Lutton, E., Prost, T. (2003). Introducing Lateral Thinking in Search Engines with Interactive Evolutionary Algorithms. In Proceedings of the 2003 ACM symposium on Applied computing, pp. 214-219. Shneiderman, B. (1998). Treemaps for space-constrained visualization of hierarchies. [WWW Document] http://www.cs.umd.edu/hcil/tree-maps Shwe, M. and Cooper, G. (1991). An empirical analysis of likelihood-weighting simulation on a large, multiply connected medical belief network. Computers and Biomedical Research, 24, 453-475. Shuang, L., Liu, F., & Yu, C. (2004). An Effective Approach to Document Retrieval via Utilizing WordNet and Recognizing Phrases. ACM SIGIR, pp.266-272.

PAGE 110

98 Starmer, C. F., & Lee, K. L. (1976). A mathematical approach to medical decisions: Application of Bayes rule to a mixture of continuous and discrete clinical variables. Comput Biomed Red, 9, 531-541. Todd, B. S., & Stamper, R. (1994). The relative accuracy of a variety of medical diagnostic programs. Methods Inf Med, 33, 402-416. Vermunt, J. K., & Magidson, J. (2003). Latent class models for classification. Computational Statistics & Data Analysi,s 41, 531-537. Voohees, E. (1994). Query Expansion using lexical-semantic relations. ACM SIGIR, 1994, pp.61-69. Yates, B. R., Neto, R. B. (1999). Modern Information Retrieval. ACM Press. 1515 Broadway, New York.

PAGE 111

BIOGRAPHICAL SKETCH Changwoo Yoon received a B.S. from Sogang University, Korea, in 1990; and an M.S. degree from Pohang University of Science and Technology (POSTECH), Korea, in 1992. He was senior research staff member at the Electronics and Telecommunications Research Institute (ETRI), Korea, for about 7 years from 1992. In 2000, he began work toward a Ph.D. in the Computer and Information Science and Engineering department at the University of Florida, Gainesville, FL. He is interested in artificial intelligence based information retrieval systems and knowledge representation. 99


Permanent Link: http://ufdc.ufl.edu/UFE0011560/00001

Material Information

Title: Domain-Specific Knowledge-Based Informational Retrieval Model Using Knowledge Reduction
Physical Description: Mixed Material
Copyright Date: 2008

Record Information

Source Institution: University of Florida
Holding Location: University of Florida
Rights Management: All rights reserved by the source institution and holding location.
System ID: UFE0011560:00001

Permanent Link: http://ufdc.ufl.edu/UFE0011560/00001

Material Information

Title: Domain-Specific Knowledge-Based Informational Retrieval Model Using Knowledge Reduction
Physical Description: Mixed Material
Copyright Date: 2008

Record Information

Source Institution: University of Florida
Holding Location: University of Florida
Rights Management: All rights reserved by the source institution and holding location.
System ID: UFE0011560:00001


This item has the following downloads:


Full Text












DOMAIN-SPECIFIC KNOWLEDGE-BASED
INFORMATION RETRIEVAL MODEL USING KNOWLEDGE REDUCTION

















By

CHANGWOO YOON


A DISSERTATION PRESENTED TO THE GRADUATE SCHOOL
OF THE UNIVERSITY OF FLORIDA IN PARTIAL FULFILLMENT
OF THE REQUIREMENTS FOR THE DEGREE OF
DOCTOR OF PHILOSOPHY

UNIVERSITY OF FLORIDA


2005





























Copyright 2005

by

Changwoo Yoon

































To my wife Jaesook, my daughter Jenny, my son Juhyung,
and my families, in God with love















ACKNOWLEDGMENTS

I would like to thank my parents for their support. They have provided

unconditional love and support. I greatly thank to all my relatives for their lovely

concerns and prayer.

I would also like to thank to William H. Donnelly for his support and beloved care

during my Ph.D. Without his support as a research assistantship; I would not have

continued my graduate work. I would like to thank my supervisory committee chair

Douglas D. Dankel for his guidance and excellent advice on research.

Finally, and most of all I express my gratitude to my beloved wife, Jaesook. Her

love, support, and prayer have not wavered in this lengthy process. She has undoubtedly

been the single most integral component to my success.
















TABLE OF CONTENTS

page

A C K N O W L E D G M E N T S ................................................................................................. iv

LIST OF TABLES .................................................... ............ ............. .. viii

LIST O F FIG U R E S .... .............................. ....................... ........ .. ............... ix

ABSTRACT .............. .......................................... xix

CHAPTER

1 IN TR OD U CTION ............................................... .. ......................... ..

1.1 Background about Intelligent Information Retrieval...........................................1
1.2 Intelligent Information Retrieval M odel..... .......... ...................................... 3

2 INFORMATION RETRIEVAL ............................................................................6

2.1 Classical Inform ation R etrieval M odels ........................................ .....................6
2.1.1 Boolean M odel .............................. ....... .. .... .............. .. 6
2.1.2 V ector Space M odel ............................................................................. 7
2.1.3 Probabilistic M odel ............................................... .......... ... ...... .. 9
2.2 Alternative Information Retrieval M odels.................................. .............10
2.2.1 Latent Semantic Indexing (LSI) ..... .......... ...................................... 11
2.2.2 Lateral Thinking in Information Retrieval .........................................12
2.3 Information Retrieval Models Involving Reasoning ............... .... ....... .....14
2.4 Evaluating Information Retrieval Performance...............................................15
2.5 U useful Techniques ............................................... ........ .. ............ 17
2.5.1 Stopw ord R em oval ............................................................................. 18
2.5.2 Stemming .................................. ..... ............... 18
2.5.3 Passage R etrieval ................. .......................... ........ ........ .......... 19
2 .5 .4 Q u ery E x p an sion ..................... .. ................................ .. ................ .. 19
2.5.5 U sing P hrase ................... ...... .................. ............ ..... 20
2.6 Enhancement of IR Through Given Knowledge .............. ...........................21
2.6.1 U sing W ordN et............. .... .......................................... .... ........ 21
2.6.2 U sing UM LS, SN OM ED ...................................... ......................... 23
2.7 Summary ...................... ............................23





v









3 KNOWLEDGE REPRESENTATION BY BAYESIAN NETWORK ....................25

3 .1 S em antic N etw ork s ................................................................... .....................2 5
3.2 Probability Principles and Calculus ............................................ ...............27
3.3 B ayesian netw ork ................... .................. ...................... ... ................. 30
3.4 Noisy-OR: Bayesian network inference................................... .................33
3 .5 Q M R -D T m odel............ ....................................................................... .......... 35
3.6 B ayesian C lassifiers.......... ..... ....................................................... ... .... ....... 37
3 .6 .1 N aiv e B ay es ............................................. ................ 3 8
3.6.2 Selective N alive B ayes ........................................ .......................... 39
3.6.3 Seminaive Bayes ....................................... ............................ 39
3.6.4 Tree Augmented Naive Bayes.... ............ .......... ... ......... ............. 39
3.6.5 Finite M ixture (FM ) m odel ............................................ ............... 40
3.7 Sum m ary ................................................................... .. ...... ........ 40

4 KNOWLEDGE-BASED INFORMATION RETRIEVAL MODEL
A R C H IT E C T U R E ..................................................................... ...... .....................42

4.1 SN OM ED ......................................... .. ........... ...............44
4.2 Anatomic Pathology Database (APDB) Design and Development...................46
4.2.1 Metadata Set Definition.............................................. 46
4.2.2 Information Processing: Retrieval and Extraction ....................................47
4 .3 Su m m ary ............................................................. .................................. 4 7

5 KNOWLEDGE-BASE MANAGEMENT ENGINE ...........................................49

5.1 Semantic Network Knowledge Base Model Representing SNOMED .............49
5.2 Classification of the Post-Coordinated Knowledge..............................52
5.2.1 Statistics of Pathology Patient Report Documents Space .......................52
5.2.2 Classification of Post-Coordinated Knowledge .............. ..............53
5.3 Statistical Model of the Post-Coordinated Knowledge ............. .....................56
5.4 Naive Bayes Model of Post-Coordinated Knowledge........................................56
5 .5 S u m m ary .................................................................... ................ 5 9

6 KNOWLEDGE CONVERSION ENGINE (KCE) ......................................... 61

6.1 Support Vector Machine Document Vector ................................................61
6.2 Conceptual D ocum ent V ector......................................... ......................... 62
6.3 KCE: Knowledge Reduction ...................................................... 63
6.4 KCE: Conversion of Pre-Coordinated Knowledge......................... ...............64
6.5 KCE: Generating the Conceptual Document Vector........................................65
6.6 KCE: Conversion of the Post-Coordinated Knowledge ......................................66
6.6.1 Statistical Model of Post-Coordinated Knowledge..............................66
6.6.2 Probabilistic Model of Post-Coordinated Knowledge..............................69
6.6 SVM IR Engine: Document Retrieval ................. ............... 71
6.7 Sum m ary ..................................... ................................ .......... 72









7 PERFORM ANCE EVALUATION .................................... ......................................73

7.1 Sim ulation Param eters ............................................................ ............... 73
7.2 Sim ulation R esult ........................ ......... .. ............ .... .... .... ....... ... ... ... ... 74
7.2.1 Performance Evaluation with Pre-Coordinated Knowledge ....................74
7.2.2 Performance Evaluation with Naive Bayes Post-Coordinated
K n ow led g e ..................................... .... ........... .......... ............... ...............7 8
7.2.3 Performance of Statistical Post-Coordinate Knowledge Model ..............80
7.3 Sum m ary ....................................................... ............. ......... 80

8 C O N C L U SIO N ......... ......................................................................... ........ .. ..... .. 82

8.1 C contributions ................. .................................. ................ ............. 82
8.2 Future W ork .................................................. ................. 84

APPENDIX

A PRIMARY TERMS WHICH ARE THE BASIS FOR THE DB ATTRIBUTE........ 86

B SN O M E D ST A T IST IC S ................................................................. .....................88

L IST O F R E FE R E N C E S ......... .. ................ ................................................................ 94

B IO G R A PH IC A L SK E TCH ..................................................................... ..................99
















LIST OF TABLES

Table page

5-1 Number of AP data each year from '83 to '94............... ....................................53

5-2 Number of unique SNOMED axes equations ................ .... .................53

5-3 R elation statistics am ong axes........................................... ........................... 54

5-4 Statistics on post-coordinated knowledge............................................ ..........55

7-1 Relevancy check result of 261 simulation documents .......................... ..........74

7-2 Value of performance gain of pre-coordinated knowledge compared to VSM .......78

7-3 Value of performance gain of post-coordinated knowledge ................. .......... 78

A Prim ary term s for A PD B ........................ ......... ............ .................. ............... 86

B -l P artial list o f T co d e ...................................................................... .................... 8 8

B -2 P partial list of M code ....................................................................... ................... 89

B -3 P partial list of E code ........................................................................ ...................90

B -4 P partial list ofF code ............................................ ................. .. ...... 91

B -5 Partial list of D code...................................................................... ............... 92

B -6 Partial list of P code ............................................ ................. .. ...... 93
















LIST OF FIGURES

Figure page

1-1 Knowledge-based information retrieval model .............................. ................4

2-1 Vector Space M odel example diagram ........................................... ............... 9

2-2 R ecall rate and precision ............................................................................ ..... 16

2-3 Relationship between recall and precision............. .............................................. 17

3-1 Example of the probability for combined evidence ..............................................30

3-2 Forward serial connection Bayesian network example................ .............. ....31

3-3 Diverging connection Bayesian network example............................................31

3-4 Converging connection Bayesian network example............................ ............31

3-5 E xam ple of chain rule ..................................................................... ...................33

3-6 E xam ple of N oisy-O R .............................................................................. ............34

3-7 General architecture of noisy-OR model .......................................... ............35

4-1 Architecture of the knowledge-based information retrieval model..........................42

4-2 Architecture of the knowledge-based information retrieval model detailed in the
example domain ................................. ........ .......... .. ............44

4-3 The "Equation" of SNOMED disease axes............................................................45

5-1 The three types of SNOM ED term relation .................................. ............... 49

5-2 SNOMED hierarchical term relationship .......................................... ............50

5-3 SNOM ED synonym s relationship................................................. ............... 51

5-4 SN OM ED M ultiaxial relationship ........................................ ....................... 51

5-5 Classification of post-coordinated knowledge ............................... ............... .55









5-6 An example of a four-axis-relation post-coordinated knowledge............................56

5-7 Structure of the post-coordinated knowledge in a Bayesian network...................57

5-8 PCKB component structure and probability estimation .......................................60

6-1 K now ledge reductions.............................................................................63

6-2 Attributes of the SNN-KB hierarchical topology relation........................................64

6-3 Example of Domain-Specific Knowledge relations..............................................67

6-4 Conversion of type-M relations................................ .......................... 68

6-5 Examples of case .. ........... .... .......................................................... 70

7-1 Perform ance evaluation m etrics ......................................................................... 73

7-2 Comparison of performance for queryl on positive cases .............. ...............75

7-3 Evaluation results of query 1 including the neutral cases. .....................................75

7-4 Evaluation results for query 2 for the positive cases.................... .................76

7-5 Evaluation results for query 2 including the neutral cases................... ..............76

7-6 Evaluation results of query 1 including post-coordinated knowledge ...................79

7-7 Evaluation results of query 2 including post-coordinated knowledge ...................79

7-8 Evaluation results of query 1 including statistical post-coordinated knowledge.....80

8-1 Knowledge reduction to statistical m odel ..................................... .................83

8-2 O ff-line application of know ledge ...........................................................................83















Abstract of Dissertation Presented to the Graduate School
of the University of Florida in Partial Fulfillment of the
Requirements for the Degree of Doctor of Philosophy

DOMAIN-SPECIFIC KNOWLEDGE-BASED
INFORMATION RETRIEVAL MODEL USING KNOWLEDGE REDUCTION
By

Changwoo Yoon

August 2005

Chair: Douglas D. Dankel II
Major Department: Computer and Information Science and Engineering

Information is a meaningful collection of data. Information retrieval (IR) is an

important tool for changing data into information. Of the three classical IR models

(Boolean, Support Vector Machine, and Probabilistic), the Support Vector Machine

(SVM) IR model is most widely used. But the SVM IR classical model does not convey

sufficient relevancy between a query and documents to produce effective results

reflecting knowledge except when using term frequency (tJ) and inverse document

frequency (idf).

Knowledge is organized information imbued by intelligence. To augment the IR

process with knowledge, several techniques have been proposed including query

expansion by using a thesaurus, a term relationship measurement like Latent Semantic

Indexing (LSI), and a probabilistic inference engine using Bayesian Networks.

We created an information retrieval model that incorporates domain-specific

knowledge to provide knowledgeable answers to users. We used a knowledge-based









model to represent domain-specific knowledge. Unlike other knowledge-based IR models,

our model converts domain-specific knowledge to a relationship of terms represented as

quantitative values, which gives improved efficiency.















CHAPTER 1
INTRODUCTION

The object of this thesis is creating an intelligent information retrieval model

producing effective results reflecting knowledge using a computationally efficient

method.

1.1 Background about Intelligent Information Retrieval

Conceptually, information retrieval (IR) is the process of changing data to

information. More technically, information retrieval is the process of determining the

relevant documents from a collection of documents, based on a query presented by the

user.

If we look at the World Wide Web (WWW) before any processing (e.g., search),

each document or web page is a datum. These data are un-interpreted signals or raw

observations that reach our senses. Providing meaning to these data allow them to

become information that is more meaningful and useful to humans than the raw data.

Information retrieval is the process that extracts information from data.

One of the well-known information retrieval models is Boolean search. In the

Boolean search model, we specify a set of query words that is compared to the words in

the documents to retrieve those documents precisely containing the given set of query

words. We can call the retrieved documents "information" but it is hard to call them

"knowledge," because additional tasks such as browsing each document and selecting the

more meaningful ones are required to transform the retrieved documents to some form of

knowledge. Knowledge is organized information.









The classic vector information retrieval model is an attempt to infuse knowledge to

information retrieval results using the frequency of the query terms that are found in the

documents. Intelligent information retrieval or semantic information retrieval attempts to

use some form of knowledge representation within the IR model to obtain more

organized information (i.e., improved precision, which is defined in Section 2.4) that is

knowledge. But it is difficult to codify or regulate the knowledge.

An ontology is the attempt to regulate knowledge and the specification of a

conceptualization (Gruber, 1993). In the artificial intelligence research fields, researchers

are using an ontology such as a knowledge representation or semantic web (Berners-Lee

et al., 2001), which is the abstract representation of data on the World Wide Web, in an

attempt to make the semantics of a body of knowledge more explicit.

We can classify an ontology as either general domain or closed domain. For

example, WordNet (Miller, 1990) is an example of a general ontology (consisting of a

thesaurus and a taxonomy) that aims to represent general-domain documents written in

natural language. We can compare closed-domain data to general-domain data.

* The subject of the closed-domain is confined. For example, a company offering
tourists information about excursions and outings might maintain this information
in a database. Such a database would consist exclusively of tour-related data.

* A closed-domain typically has its own knowledge repository such as a term
dictionary and relations that exist between terms. Good examples of such a
repository are the medical field's Unified Medical Language System (UMLS) and
Systematized Nomenclature of Medicine (SNOMED). We call these domain
specific knowledge.

The nature of closed-domain data allows us to use better semantics than that of general-

domain data.

Applying knowledge in the information retrieval process normally requires

significant computation. This computation occurs when the intelligent information









retrieval system tries to search the knowledge space during the retrieval process. From

this, we can derive the following set of research questions for closed-domain IR using

domain-specific knowledge:

* "How can we express effectively the domain specific knowledge as an ontology?"

* "What is the relationship between explicit semantics, ontology, and information
retrieval?"

* "How can we maximize the efficiency of IR using the given domain specific
knowledge (ontology)?"

1.2 Intelligent Information Retrieval Model

Our research aims to create an information retrieval model that incorporates

domain-specific knowledge to provide knowledge-infused answers to users. The closed-

domain data we used consists of pathology patient reports. Figure 1-1 is a conceptual

model of the proposed domain-specific knowledge-based information retrieval model.

Details of the model are given in Chapters 4, 5, and 6.

A classical vector space model (VSM) information retrieval system using term

frequency and inverse term frequency creates the query vector (1) and document vector

(2). The knowledge base management engine (KME) creates (5) the knowledge from the

existing documents set (3) before the system operation starts. The KME adds knowledge

(5) from new document (4) as they enter the database. The Knowledge Conversion

Engine (KCE) applies the knowledge (semantics) of the Knowledge Base (7) to the

Document Vector (6) to create the Conceptual Document Vector (8). The conventional

VSM IR engine calculates the relevance between the query vector (9) and the conceptual

document vector (10) resulting in a ranked document list (11).
































10
Ir *Ranked,
VSM IR engine Ra ked
SI11 Result

Figure 1-1. Knowledge-based information retrieval model

Using this model results in the following contributions to information retrieval

research:

* This information retrieval model is a knowledge-based IR model. Unlike other
models, that perform knowledge level information retrieval tasks such as ontology
comparison and ontological query expansion, this model reduces the knowledge
level represented by the knowledge base to the information level such as the vector
space model's document vector.

* Unlike other knowledge-based IR models, which have a heavy computation
requirement because they compare concepts between the IR model and the query
when the user requests information, this model uses the off-line application of
knowledge to the document vector leaving only a similarity measurement
calculation between the query and the documents.









* When a new document arrives in the system we modify the knowledge base with
only the knowledge that can be obtained and augmented from that new document,
not from the pre-defined knowledge base. We call this a dynamic feature of the
knowledge base. The dynamic feature of the knowledge base can be mapped to a
statistical feature by off-line knowledge conversion. This means that we apply the
changes of the document vector and the knowledge base in specified time intervals
not when introduced.

* This model can be applied to IR applications in the general domain if these
applications have a domain-specific knowledge ontology.

* Unlike other models, which have difficulty applying a knowledge hierarchy to the
IR model, the knowledge-based model uses a hierarchical term relevancy value to
express the knowledge hierarchy.

The organization of this thesis is as follows. Chapter 2 surveys the current research

efforts on information retrieval. Chapter 3 surveys the current research topics on

knowledge representation and inference using probability, concentrating on Bayesian

networks. Chapter 4 introduces the proposed information retrieval model for closed-

domain data. Chapter 5 and 6 discuss the details of the model. Chapter 7 presents a

performance evaluation of the model. The thesis concludes with Chapter 8, which


provides future research work to be completed.














CHAPTER 2
INFORMATION RETRIEVAL

2.1 Classical Information Retrieval Models

Information retrieval (IR) is a process that finds relevant documents (information)

from a document collection given a user's request (generally queries). In contrast to data

retrieval, which consists of determining which documents of a collection contain the

keywords in the user's query, an IR system is concerned with retrieving information

about a subject represented by the user's query.

There are three classic models in information retrieval: the Boolean, the vector, and

the probabilistic models (Yates and Neto, 1999, p. 21). The Boolean model is set

theoretic because documents and queries are represented as a set of index terms. The

vector model is algebraic because documents and queries are represented as vectors in a

t-dimensional space where t is the total number of index terms. In the probabilistic model,

probability theory forms the framework for modeling documents and query

representations.

2.1.1 Boolean Model

The Boolean model is a simple retrieval model based on set theory and Boolean

algebra (Yates and Neto, 1999, p. 25). In Boolean information retrieval, a query typically

consists of a Boolean expression, such as "(cat OR dog) AND NOT goldfish," and each

document is represented by the set of terms it contains. The execution of a query consists

of obtaining, for each term in the query, the set of documents containing this term. These

sets of retrieved documents are then combined using the usual set theoretic union (for OR









queries), intersection (for AND), or difference (for NOT) to obtain a final set of

documents that match the query. The Boolean model provides a framework that is easy to

understand by a common user of an IR system. Furthermore, the queries are specified as

Boolean expressions having precise semantics.

But, the Boolean model suffers from two major drawbacks. First, using the Boolean

model requires skilled users who can formulate quality Boolean queries. When the only

users of an IR system are librarians, for example, or computer scientists conversant in

logic, and the information to be searched is in a known or restricted form (such as

bibliographic records), a Boolean system is adequate. However, in cases where the users

are less skilled, or the information to be searched is less well-defined, a ranked strategy

(vector space, probabilistic, etc.) may be more effective. The Boolean model's second

drawback is that its retrieval strategy is based on a binary decision criterion (i.e., a

document is predicted to be either relevant or non-relevant) without any notion of a

grading scale, which prevents good retrieval performance. Thus, the Boolean model is in

reality much more a data retrieval model.

2.1.2 Vector Space Model

The vector space information retrieval model, first introduced by Salton et al.

(1975), takes a geometrical approach. A vector, called the "document vector," represents

each document. This vector is of identical length for all documents with the length

equaling the number of unique terms in the entire collection of documents.

Salton et al. (1975) defined the "term weight" (also known as the importance

1 eighi) as the ability of a term to differentiate one document having the term from other

documents having the same term.









A number of weighting schemes can be used in the vector space model. Salton uses

two properties: the term frequency and the inverse document frequency. The term

frequency (tJ) is the intra-document importance, which is the frequency of the term

occurring in a document. Term frequency measures how well that term describes the

document content. A term with a higher term frequency is more important than a term

with a lower frequency. The inverse document frequency (idj) is the number of

documents in the corpus which the term occurs. The inverse document frequency of term

j is calculated as


idf, =log N
n)

where N is the number of documents in the collection, and n, is the number of documents

in which termj occurs. The inverse document frequency is the inter-document

importance. If a term is uniformly present across the entire system, the term is less

capable of differentiating the documents, which means that it has less importance than a

term having a small global weight. We can calculate the term weight w,j of term i in

documents as

w ,j = tf x idf

where tf, is the term frequency of term i in documents, and idf, is the inverse document

frequency of term i in the entire set of documents.

After constructing the document and query vectors using the weighting scheme, we

calculate the similarity coefficient. One of the best known similarity coefficients is the









cosine measure (Salton, 1968), defined for the query vector q = (q,, q2,'" q,) and the

document vector d = (w,j, w2,j -,-,w,j ) where t is the number of terms:


q *d, q1 x w,,
sim(q, d) cos(q,) )=


The cosign similarity measures the angle between the query and document vectors in n-

dimensional Euclidean space.

Suppose that we have a query consisting of two terms and a set of documents that

may or may not contain those terms. Figure 2-1 illustrates the vector model and its

similarity measure between two documents, dl and d2, and query q which contain those

terms. The similarity between document 1 (di) and the query is Sl,q; while the similarity

between document 2 (d2) and the query is S2,q.

t2


wd2, --------------- d2

wd2,2 di
wd1,2 ----

wq2 --- -------- ----- q

2,q


wd1,2 wdl, wq1
Figure 2-1. Vector Space Model example diagram

2.1.3 Probabilistic Model

Probabilistic retrieval defines the degree of relevance of a document to a query in

terms of the probability that the document is relevant to the query. Maron and Kuhns,









(1960) first introduced the concept of probabilistic indexing in the context of a library

searching system. Robertson and Sparck-Jones (1976) introduced what is now known as

the binary independence retrieval (BIR) model, which is considered the standard model

of probabilistic retrieval.

The fundamental assumption of the probabilistic model is that the probabilistic

model estimates the probability of the relevancy of a document with a given user's query

q. If we state this as an equation, we can define the similarity of the jth document, dj, to

a query q as the ratio

P(R d )
sim(dj, q) = (2-1)
P(R | d )

where R is the set of documents known to be relevant, R is the set of non-relevant

documents, P(R I d ) is the probability that document d, is relevant to the query q, and


P(R I -) is the probability that dj is non-relevant to the query q.

The problem with Equation 2-1 (one disadvantage of the probabilistic model) is

that we must guess the initial value of the document relevancy. The first probabilistic

model, the BIR model, also did not consider the term frequency, which is a basic

assumption of the vector space model.

2.2 Alternative Information Retrieval Models

The classical information retrieval model does not consider the dependency among

the index terms. For examples, in the vector space model, all terms in the document

vector are orthogonal. The Latent Semantic Indexing (LSI) model (Furnas et al., 1988) is

one of the IR models that incorporates term dependency.









2.2.1 Latent Semantic Indexing (LSI)

The classical information retrieval models use index terms as querying tools. The

selection of the index terms is based on the assumption that the terms represent the

"user's need," that is they represent the concept of the user's query intention. But as the

search results show, index terms do not really contribute to the concepts of information

retrieval. For example, if the user wants to search about "Major cities in Florida," the

index terms used may be "Major," "city," and "Florida." The search engine may try to

find documents containing these keywords. But if the search engine is intelligent and

supports conceptual matching, it would try to search for keywords such as "Tampa,"

"Orlando," and "Miami" in the same way as human do.

The main idea of Latent Semantic Indexing (LSI) comes from the fact that a

document may contain words having similar concepts. So LSI considers documents that

have many words in common to be semantically close and vice versa (Furnas et al., 1988).

From the example in the previous paragraph, if the words "major," "city," "Florida,"

"Tampa," "Orlando," and "Miami" appears together in enough documents, the LSI

algorithm will conclude that those terms are semantically close, then return all documents

containing terms "Tampa," "Orlando," and "Miami" even though these latter terms are

not part of the given index terms.

The most important point of the LSI algorithm is that all calculations are performed

automatically by only looking at the document collection and index terms. As a result, the

problems of "Polysemy" and "Synonymy" can be addressed efficiently without the aid of

a thesaurus. Polysemy is the problem of a word having more than one meaning.

Synonymy is the problem that there are many ways of describing the same object.









LSI generally uses a statistical method called Singular Value Decomposition

(SVD) to uncover the word associations between documents. The effect of SVD is to

move words and documents that are closely associated nearer to one another in the

projected space. It is possible for an LSI based system to locate and use terms that do not

even appear in a document. Documents that are located in a similar part of the concept

space are retrieved, rather than only matching keywords.

2.2.2 Lateral Thinking in Information Retrieval

The human brain is divided into two halves: the left and right brain. The left-brain

excels at sequential thinking where the desired outcome is achieved by following a

logical sequence of actions. In contrast, the right brain is optimized for creativity where

the desired outcome may require a degree of non-linear processing.

Most information retrieval activity is focused on the requirements of sequential

thinking, which is most comfortable when searching with precision. An example of

sequential thinking in information retrieval is a Boolean logic search. When searching for

specific information, traditional techniques can be used to find documents containing the

required keywords combined with Boolean logic. "Sequential thinking," which is a process

of left-brain, is an analogous term to "vertical thinking."

Sometimes we are looking for information about a particular topic but the concept

is nebulous and difficult to articulate precisely. With this type of query it is difficult to

specify our search so that all of the best documents are found without too many irrelevant

ones. These difficulties are compounded if there is uncertainty about the presence of

documents, for example searches designed to gather evidence, or to prove the absence of,

information about the selected topic.









A successful outcome is likely to involve some right brain activity as we iterate the

process with carefully modified search criteria. This kind of brain activity is called

"lateral thinking" (Bono, 1973). The lateral thinking process is concerned with insight

and creativity. It is a probabilistic rather than a finite process.

In an information retrieval context, vertical thinking is used when we know

precisely for what we are looking and selecting the finite set of relevant documents is

relatively straightforward. In contrast, lateral thinking is applied where the requirements

are less well defined and the process of locating relevant information involves some

degree of trial and error. Unfortunately, traditional techniques, employed when searching

with precision, do not provide much assistance with this type of problem and the user is

left to try query after query until they have exhausted all permutations.

The ability to automatically identify multi-word concepts is absolutely fundamental

to provide some assistance to the right brain when searching unstructured information.

Without this ability the system is simply analyzing individual word frequencies that are

unlikely to make much sense to a human brain when taken out of context. Several

approaches (i.e., linguistics, artificial intelligence, and Bayesian networks) have

attempted to imbue concepts into the information retrieval model without much success.

Given that 90% of data is unstructured presents difficulties to the current statistical

information retrieval methods. If the data are well structured like in a relational database

schema, where a query is very specific, we can predict a precise result that is like vertical

thinking. Unfortunately, many people expect to search unstructured information in the

same way and are often disappointed when the documents they expect to find are not









returned. The problem is that unstructured data are highly variable in layout, terminology,

and style while the queries tend to be more difficult to define.

Yann et al. (2003) suggested using feedback from the user requests to retrieve

"alternative" documents that may not be returned by more conventional search engines,

in a way that may recall "lateral thinking" to solve heterogeneous large scale

pharmaceutical database problem (Yann et al. 2003). The proposed solution replaces the

query expansion phase by a query processing phase, where evolved modules are applied

to the query with two major results (Yann et al. 2003, p. 215):

* Rewritten queries will preferably retrieve documents that match fields of interest of
the user.

* Other documents related to previous and present queries will be retrieved, therefore
bringing some "lateral thinking" abilities to the search engine.

The system employs evolutionary algorithms, used interactively, to evolve a "user

profile" at each new query. This profile is a set of "modules" that perform basic rewriting

tasks on words of the query. The evaluation step is extremely simple: a list of documents

corresponding to the processed query is presented to the user. The documents actually

viewed by the users are considered as interesting, and the modules that retrieved the

document are rewarded accordingly. Modules that rarely or never contribute to the

retrieval of "interesting" documents are simply discarded and replaced by newly

generated modules. He used genetic programming technique to evolve the user profile

modules automatically.

2.3 Information Retrieval Models Involving Reasoning

A Bayesian network is a directed acyclic graph whose nodes represent random

variables and whose edges represent causal relationships between nodes. A causal

relationship means that if two nodes are connected, the parent node (i.e., the node from









which the edge comes) is considered to be a potential cause of the child node (i.e., the

node to which the edge points). We can consider the causal relationship as a probabilistic

dependency (Fung and Favero, 1995).

Lee et al. (2002) also proposed a Bayesian network model for a medical language

understanding system, which provides a noise-tolerant and context-sensitive character of

the system. He showed a relevant inference based on Bayesian network patterns.

Those information models performing inference based on Bayesian networks are

not yet at a mature stage and significant research is still needed in this area. This method

also has a problem with the heavy computational requirements needed to perform the

inference.

2.4 Evaluating Information Retrieval Performance

An evaluation of a system is usually performed before the release of the computer

system. Commonly, the measures of system's performance are time and space. For

example, in a data retrieval system like a database system, the response time and the

space requirement are the most interesting metrics. But in the information system, other

metrics are also interesting (Yates and Neto, 1999). This results from the vagueness of a

user's request to an information retrieval system. The retrieval results also produce partial

matches. The most common IR system, the vector space model, produces documents

ranked according to their relevance with the query. So the evaluation for information

retrieval should have a metric that evaluates how precise the answer of the IR system is.

The most commonly used metrics for relevancy evaluation of IR are recall and precision.

Consider a database where there are 100 documents related to the general field of

data extraction. A query on "text mining" may retrieve 400 documents. If only 40 of the

retrieved documents are about data extraction, the recall rate of the tested engine is 40%,

since the database contains 100 documents on data extraction (Schweitzer, 2003). Since

only 40 documents among 400 matched the request of the user, the precision rate of the

engine on this test is 10%. See Figure 2-2. If the desired set of returned documents (i.e.,

the target) is known, the recall rate is the proportion of returned documents that match the

target with respect to the total size of the target. The precision is the proportion of

relevant documents in the document set returned by the system.

[Figure: Venn diagram over the set of all documents, showing the retrieved set (400 documents), the relevant set (100 documents), and their intersection (40 documents), with Recall = Rel Retrieved / Relevant and Precision = Rel Retrieved / Retrieved.]

Figure 2-2. Recall rate and precision
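To make the computation concrete, here is a minimal sketch in Python (assuming documents are identified by simple IDs held in sets; the IDs are illustrative):

    def recall_precision(relevant, retrieved):
        """Compute (recall, precision) from sets of document IDs."""
        hits = len(relevant & retrieved)      # relevant documents actually returned
        return hits / len(relevant), hits / len(retrieved)

    relevant = {f"doc{i}" for i in range(100)}        # 100 documents on data extraction
    retrieved = {f"doc{i}" for i in range(60, 460)}   # 400 retrieved, 40 of them relevant
    r, p = recall_precision(relevant, retrieved)
    print(f"recall={r:.0%}, precision={p:.0%}")       # recall=40%, precision=10%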

Trivially, if an algorithm always retrieves all documents in a document base, it has

one hundred percent recall. However, this retrieval has low precision because it is

unlikely that all documents match the query. In this sense, precision and recall have an

inverse relation, as shown in Figure 2-3. In many evaluations, precision is measured at a

fixed number of retrieved documents, e.g. "precision at 25," which gives a measure of

how well an algorithm delivers at the top of the retrieved list. In others, recall and

precision are plotted against each other: precision at a certain point of recall indicates

how many irrelevant documents readers must examine until they know they have found

at least half of the interesting documents. In the Text REtrieval Conference (TREC)

evaluations an "11-point" average measure is used, with precision measured at every 10

percent of recall: at 10 percent recall, at 20 percent recall, and so forth to 100 percent

recall, where all relevant documents are assumed to have been retrieved (Baeza and Neto,

1999, p. 76). The average precision at all those recall points is used as the total measure.
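A minimal sketch of this averaged measure (Python; note that the standard 11-point measure also includes the 0 percent recall level, and the ranked list and relevant set below are purely illustrative):

    def eleven_point_average_precision(ranked, relevant):
        """Average interpolated precision at recall levels 0.0, 0.1, ..., 1.0."""
        points, hits = [], 0
        for k, doc in enumerate(ranked, start=1):
            if doc in relevant:
                hits += 1
            points.append((hits / len(relevant), hits / k))   # (recall, precision)
        # Interpolated precision at a level: best precision at any recall >= level.
        levels = [i / 10 for i in range(11)]
        interp = [max((p for r, p in points if r >= lvl), default=0.0) for lvl in levels]
        return sum(interp) / len(interp)

    print(eleven_point_average_precision(["d1", "d2", "d3", "d4"], {"d1", "d3"}))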



[Figure: precision (vertical axis, 0 to 100) plotted against recall (horizontal axis, roughly 10 to 60), illustrating their inverse relationship.]

Figure 2-3. Relationship between recall and precision

Several methods help to maximize recall rates, for example, query expansion using

synonyms. Using this method, a search engine will also find documents on data

extraction provided that its thesaurus contains "data" as a synonym for "text" and

"extraction" as synonym for "mining." Significant research is currently being performed

on man-made thesauri to ensure that all documents that could match a query are actually

found (Foskett, 1997).

2.5 Useful Techniques

Other than the core information retrieval algorithm, there are a number of

techniques that are mandatory for IR processing such as document preprocessing,

stopword removal, and stemming. This section discusses several of these techniques that

might improve IR performance using text processing.

2.5.1 Stopword Removal

Stopwords are words that occur very frequently among documents in the collection.

In general, stopwords do not carry any useful information. Articles, prepositions, and

conjunctions such as "in," "of," "the," etc., are natural candidates for a list of stopwords.

Stopword removal has often been shown to be effective at improving retrieval

effectiveness, even though many term weighting approaches are designed to give a lower

weight to terms appearing in many documents. It also has the benefit of reducing the size

of the index term structure. Stopword removal is built into many IR engines.

In some situations, stopword removal causes reduced recall. For example, if the

user's query is "to be or not to be," the only index term left after stopword removal is

"be." As a result, some search engine do not adopt stopword removal. They use full text

indexing instead.
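A minimal sketch of stopword removal (Python; the stopword list here is a tiny illustrative sample, not a standard list):

    STOPWORDS = {"in", "of", "the", "to", "be", "or", "not", "a", "and"}  # sample list

    def remove_stopwords(text):
        """Tokenize on whitespace and drop stopwords (case-insensitive)."""
        return [w for w in text.lower().split() if w not in STOPWORDS]

    print(remove_stopwords("The cause of the fever"))  # ['cause', 'fever']
    print(remove_stopwords("to be or not to be"))      # [] -- the recall problem above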

2.5.2 Stemming

Stemming is the process of removing affixes (i.e., prefixes and suffixes) and

allowing the retrieval of documents containing syntactic variations of query terms (Yates

and Neto, 1999, p. 165). This can involve, for instance, removing the final "s" from

plural nouns or converting verbs to their base form ("go" and "goes" both become "go,"

etc.). The most widely known stemming algorithm is the Porter algorithm (Porter, 1980),

which is built into many information retrieval engines.

The Porter algorithm uses a suffix list for suffix stripping. The algorithm has

several rules applicable to the suffixes of words. For example, a rule of the form

s -> (nil)

is used to convert plural forms into their singular forms by substituting the suffix letter

"s" with nil.

2.5.3 Passage Retrieval

Passage retrieval is the process of retrieving text in smaller units than complete

documents. The basic assumption of passage retrieval is that terms inside a meaningful

unit like a sentence carry more meaning than terms scattered across a whole document. Callan (1994) describes

several approaches to passage identification, including paragraph recognition and

window based approaches, in which the position of the passage is determined by the

positions in the document of the terms matching the query.

In the classical information retrieval method, the order and distance of index terms

in the documents and the query have no meaning. If we use a word as an index term unit

and multiple closely located words combine to form a specific phrase, the order and

distance among the index terms carry meaning that a set of unordered

terms lacks.
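A minimal sketch of window-based passage scoring in this spirit (Python; the window size is an arbitrary illustrative choice):

    def best_passage(tokens, query_terms, window=30):
        """Return (start, score) of the window containing the most query terms."""
        best_start, best_score = 0, -1
        for start in range(max(1, len(tokens) - window + 1)):
            score = sum(1 for t in tokens[start:start + window] if t in query_terms)
            if score > best_score:
                best_start, best_score = start, score
        return best_start, best_score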

2.5.4 Query Expansion

Whenever a user wants to retrieve a set of documents, he starts to construct a

concept about the topic of interest. Such a conceptualization is called the "information

need." Given an "information need," the user must formulate a query that is adequate for

the information retrieval system. Usually, the query is a collection of index terms, which

might be erroneous and improper initially. In this case, a reformulation of the query

should be done to obtain the desired result. The reformulation process is called query

expansion.

One of the simplest techniques involves the use of a thesaurus to find synonyms for

some or all of the terms in the query. These synonyms are added to the query to broaden

the search. The thesaurus used can be manually generated for a specific domain, such as

the medical domain. But for a general domain like the Web, it is hard to generate such a

knowledge base because the documents from the general domain are

comparatively new, large, and dynamically changing.

Various algorithms have been suggested for generating thesauri automatically. For

example, Crouch and Yang (2000) suggest a method based on clustering and term

discrimination value theory.

Another widely used method of query expansion is the use of relevance feedback.

This involves the user performing a preliminary search, then examining the documents

returned and deciding which are relevant. Finally, terms from these documents are added

to the query and the search is repeated. This obviously requires human intervention and,

as a result, is inappropriate in many situations. However, there is a similar approach,

sometimes called pseudo-relevance feedback, in which the top few documents from an

initial query are assumed relevant and are used for automatic feedback (Mitra et al. 1998).
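A minimal sketch of thesaurus-based expansion (Python; the tiny synonym table is illustrative only):

    THESAURUS = {"text": ["data"], "mining": ["extraction"]}  # illustrative entries

    def expand_query(terms):
        """Add thesaurus synonyms for each query term to broaden the search."""
        expanded = list(terms)
        for t in terms:
            expanded.extend(THESAURUS.get(t, []))
        return expanded

    print(expand_query(["text", "mining"]))  # ['text', 'mining', 'data', 'extraction']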

2.5.5 Using Phrases

Many information retrieval systems are based on a vector space model (VSM) that

represents a document as a vector of index terms. The classical VSM uses a word as an

index term. To improve retrieval accuracy, it is natural to replace word stems with

concepts. For example, replacing word stems with a Unified Medical Language System

(UMLS) code if the document domain is medical is a possible way to include a concept

in information retrieval. However, previous research showed not only no improvements,

but a degradation in retrieval accuracy when concepts were used in document retrieval.

Replacing word stems with multiple word combinations was also studied. One

study used a phrase as an indexing term (Mao and Chu, 2002). A phrase is a string of

words used to represent a concept. The conceptual similarity and common word stems

jointly determine the correspondence between two phrases, which gains an increase in

retrieval accuracy when compared to the classical VSM model.

Separating the importance of weighting in the VSM model has been suggested (Shuang

et al. 2004). Shuang et al. considered phrases to have more importance than individual

terms in information retrieval. They used a tuple of two separate similarity measures

between the document and the query, (phrase-sim, term-sim), where phrase-sim is the

similarity obtained by matching the phrases of the query against the documents and term-

sim is the usual similarity measure used in the VSM model. Documents are ranked in

descending order of (phrase-sim, term-sim) where phrase-sim has a higher priority.
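A minimal sketch of this two-level ranking (Python; the similarity values are placeholders for whatever the phrase and term matching produces):

    # Each entry: (doc_id, phrase_sim, term_sim); the values are illustrative.
    docs = [("d1", 0.2, 0.9), ("d2", 0.5, 0.1), ("d3", 0.5, 0.4)]

    # Sort descending by the tuple (phrase_sim, term_sim): phrase similarity
    # dominates, and term similarity only breaks ties.
    ranked = sorted(docs, key=lambda d: (d[1], d[2]), reverse=True)
    print([d[0] for d in ranked])  # ['d3', 'd2', 'd1']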

2.6 Enhancement of IR Through Given Knowledge

2.6.1 Using WordNet

WordNet is an electronic lexical database developed at Princeton University

beginning in 1985 (Miller, 1990). WordNet 2.0 has over 130,000 word forms. It is widely

used in natural language processing, artificial intelligence, and information technology

such as information retrieval, document classification, question-answer systems,

language generation, and machine translation.

The basic building blocks of WordNet are synonym sets ("synsets"), which are

unordered sets of distinct word forms and which correspond closely to what are called

"concepts." Examples of synsets are {car, automobile} or {shut, close}. WordNet 2.0

contains some 115,000 synsets.

There are two kinds of relations in WordNet: semantic and lexical relations.

Examples of semantic relations are "is-a," "part-of," "cause," etc. An "is-a" semantic

relation hierarchically organizes nouns and verbs from the top generic concepts to the

bottom specific concepts. Examples of lexical relations are synonymy and antonymy.

There have been several attempts to use WordNet for information retrieval (Chai

and Biermann, 1997). Query expansion is one such method: it expands the query with terms of

similar meaning using a thesaurus like WordNet. This technique increases the chances of

retrieving more relevant documents. Several other research projects about query

expansion using WordNet have been performed (Voorhees, 1994), but the results are not

good: there is a small increase of recall but a degradation on precision. Rila et al. (1998)

concluded that the degradation of performance for IR using WordNet is caused by the

poorly defined structure of WordNet. It is impossible to find term relationships with

different parts of speech because words in WordNet are grouped based on part-of-speech.

Most of the relationships between two terms are not found in WordNet because WordNet

handles general lexical knowledge. Sanderson described most efforts in information

retrieval using WordNet and noted that a simple dictionary (or thesaurus) based word

sense representation has not been shown to greatly improve retrieval effectiveness

(Sanderson, 2000).

A recent study on word sense disambiguation in information retrieval using

WordNet (Kim et al. 2004) shows the possibility of improving IR performance using

WordNet knowledge. They proposed a root sense tagging approach. They noticed that the

traditional method described in the previous paragraph used a fine-grained disambiguation

for IR tasks. For example, the word "stock" has 17 different senses in WordNet, which

are used in word sense disambiguation. The root senses, in contrast, are top-level

categories such as "act," "animal," "artifact," "attribute," "body," etc. Using these root

classifications when performing word sense disambiguation, called coarse-grained

disambiguation, showed an improvement of retrieval effectiveness.

2.6.2 Using UMLS, SNOMED

Medical language is extremely rich, varied, and difficult to comprehend and

standardize, and it suffers from vagueness and imprecision. As a result, there have been many

efforts to make medical term dictionary structures such as the Unified Medical Language

System (UMLS) and Systematized Nomenclature of Medicine (SNOMED).

SNOMED is a hierarchically organized and systematized multiaxial nomenclature

of medical and scientific terms. We provide more detail on SNOMED in Chapter 3.

The terms in SNOMED and UMLS often require expert knowledge, so non-experts

like patients and lawyers cannot recognize the terms used. This problem motivates efforts

to combine WordNet and UMLS (Barry and Fellbaum 2004), since WordNet was not

built for domain-specific applications, creating a need for a lexical database designed

specifically for the needs of natural-language processing in the medical domain.

This approach expands the synonyms thesaurus resulting in an information retrieval

query expansion. There are many efforts to visualize the concept of information.

Sometimes a figure is worth a thousand words (Pfitzner et al. 2003) with the use of a

picture facilitating a user's understanding of the presented information. Keynets,

developed by Kenneth (http://ww.,, .,i, i, Jii ,",. kiL, key/fast/fast.html), is one such

information visualization technique for representing information in a visual manner. To

extract meaning from technical documents, ontologies such as UMLS and semantic

frameworks like Keynets can be combined, which improves the accuracy and

expressiveness of natural language processing.

2.7 Summary

We described three classical information retrieval models: Boolean, Vector, and

Probabilistic. There have been several attempts to augment knowledge in the information

retrieval process, such as query expansion and using a phrase as a search term. Our

attempts to incorporate knowledge in IR involve using a knowledge source directly as a

form of knowledge representation. Possible candidates for knowledge sources include

UMLS and SNOMED. Our developed model uses knowledge in the form of a semantic

network and a Bayesian network.

The next chapter explains the background required to understand the knowledge

base, especially the probabilistic Bayesian network model.

CHAPTER 3
KNOWLEDGE REPRESENTATION BY BAYESIAN NETWORK

As we will see, the knowledge in our experimental domain (pathology) consists of

two types. The first is pre-defined knowledge that can be used in describing data (i.e., a

patient's report). This type of knowledge can be expressed well using a semantic network.

The second type of knowledge is obtained from data that are not pre-defined. Normally,

experts describe this knowledge after analyzing the data. Errors will possibly intervene

during the writing and analyzing process, which means there is an uncertainty in the

knowledge. This type of data can be modeled well by a probability model, especially the

Bayesian network.

This chapter presents a discussion on knowledge representation issues,

concentrating on semantic networks and Bayesian networks, and surveys some of the

relevant literature.

3.1 Semantic Networks

Semantic networks are often used as a form of knowledge representation. They

were developed for representing knowledge within English sentences by representing

human memory's structure of having a large number of connections and associations

between the different pieces of information contained in it. Today, the term associative

networks is more widely used to describe these networks since they are used to represent

more than just semantic relations. They are widely used to represent physical and/or

causal associations between various concepts or objects.

A semantic network is a directed graph consisting of vertices that represent

concepts and edges that represent semantic relations between the concepts.

An important feature of any associative network is the associative links that

connect the various nodes within the network. It is this feature that makes associative

graphs different from simple directed graphs. Within knowledge-based systems,

associative networks are most commonly used to represent semantic associations. In the

more technically oriented applications, they can be used to express both the physical and

causal structure of systems.

The important semantic relations often used within a semantic network are:

* Meronymy (A is part of B),
* Holonymy (B has A as a part of itself),
* Hyponymy (or troponymy) (A is subordinate of B; A is kind of B),
* Hypernymy (A is superordinate of B),
* Synonymy (A denotes the same as B), and
* Antonymy (A denotes the opposite of B).
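Such a network can be sketched as a set of labeled directed edges (Python; the concept triples below are illustrative only):

    # Edges: (source concept, relation, target concept) -- illustrative triples.
    EDGES = [
        ("right lung", "part-of", "lung"),   # meronymy
        ("lung", "has-a", "right lung"),     # holonymy
        ("pneumonia", "is-a", "disease"),    # hyponymy
    ]

    def related(concept, relation):
        """Return all concepts reachable from `concept` via `relation`."""
        return [dst for src, rel, dst in EDGES if src == concept and rel == relation]

    print(related("right lung", "part-of"))  # ['lung']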

An example of a semantic network is WordNet, a lexical database of English. A

major problem of semantic networks is that although the name of this knowledge

representation contains the word "semantic," there is no clear semantics of the various

network representations. By representing the knowledge explicitly within an associative

network, a knowledge-based system obtains a higher level of understanding for the

actions, causes, and events that occur within a domain. The higher level of understanding

allows the system to reason more completely about problems that exist within the domain

and to develop better explanations in response to user queries (Gonzalez and Dankel

1988, p. 167).

3.2 Probability Principles and Calculus

This section provides the core principles necessary to understand Bayesian calculus,

which is the base model of the proposed knowledge base. This section starts with the

basics of probability calculus. Then, it introduces the concept of subjective probability

and conditional probability.

Probability is a method for articulating uncertainty. It also gives a quantitative

understanding of uncertainty providing a quantitative method for encoding likelihood.

Probabilistic methods and models give us the ability to attach numbers to the likelihood

of various results.

The standard view of probability is the frequentist view. This view says that

probability is really a statement of frequency. You can obtain a probability by watching

recurring events repeat over time. For example, the probability of a hurricane hitting

Florida during hurricane season can be determined by examining the historical record of

where hurricanes have struck the USA. In this view, probability is something that is

inherent in the process.

An alternative view of probability that is very useful to artificial intelligence

research is the subjective view, or Bayesian view. In the subjective view, probability is a

model of your degree of belief in some event. A Bayesian probability is the value or

belief of the person who assigns the probability (e.g., your degree of belief that a coin

will land heads), whereas a classical probability is based on the physical properties of the

world (e.g., the probability that a coin will land heads). In light of these statements, a

degree of belief in an event is referred to as a Bayesian or personal probability, while the

classical probability is referred to as the true or physical probability of that event.

Probability is a logic and a language for talking about the likelihood of events. An

event is a set of atomic events and is a subset of the universe of all events. A

probability distribution is a function that maps events into the range of values between 0

and 1. Probability satisfies the following properties.

P(true) = 1 = P(Universe),
P(false) = 0 = P(∅), and
P(A ∪ B) = P(A) + P(B) − P(A ∩ B).

A random variable describes a probability distribution in which the atomic events

are the possible values that could be given to the variable. If we have multiple random

variables, we can talk about their joint distribution or the probability assignment to all

combinations of the values of the random variables. In general, the joint distribution

cannot be computed from the individual distributions. If we know all values of the joint

distribution, we can answer any probability question. But if the domain is big, the

complexity grows exponentially.

We can introduce a concept of conditional probability.

P(A|B) = P(A ∩ B) / P(B)    (3-1)

This is the probability of A given B and states we are restricting our consideration

just to the part of the world in which B is true. We can derive Bayes' rule from the

definition of conditional probability.

P(A|B) = P(B|A) P(A) / P(B)    (3-2)

To make this more concrete, consider the medical domain where we have diseases

and the symptoms associated with each disease:

P(disease|symptom) = P(symptom|disease) P(disease) / P(symptom).
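A minimal numeric sketch of this computation (Python; the probabilities are invented purely for illustration):

    p_symptom_given_disease = 0.9   # hypothetical P(symptom | disease)
    p_disease = 0.05                # hypothetical P(disease)
    p_symptom = 0.15                # hypothetical P(symptom)

    # Bayes' rule (equation 3-2):
    print(p_symptom_given_disease * p_disease / p_symptom)  # 0.3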

The probability of a symptom given a disease is generally constant and does not

change according to the particular situation or patient. So it is easier, more useful, and

more generally applicable to learn these causal relationships.

So Bayes' rule has practical importance for conditional probability. We can use the

conditioning rule to obtain P(A).

P(A) = P(A|B) P(B) + P(A|~B) P(~B)
     = P(A ∩ B) + P(A ∩ ~B)

We say A and B are independent, if and only if the probability that A and B are true

is the product of the individual probabilities of A and B being true.

P(A ∩ B) = P(A) P(B)
P(A|B) = P(A)
P(B|A) = P(B)

Independence is essential for efficient probabilistic reasoning. There is a more

general notion, which is called conditional independence. This states that A and B are

conditionally independent given C if and only if the probability of A given B and C is

equal to the probability of A given C.

P(A|B, C) = P(A|C)
P(B|A, C) = P(B|C)
P(A ∩ B|C) = P(A|C) P(B|C)

We can solve the Bayesian network probability distribution using Bayes' rule and

conditional independence.

P(C|T, X) = P(T, X|C) P(C) / P(T, X)

Assume T and X are conditionally independent given C.

P(C|T, X) = P(T|C) P(X|C) P(C) / P(T, X)

[Figure: node C with arrows to the two evidence nodes T and X.]

Figure 3-1. Example of the probability for combined evidence

We can obtain P(T,X) by the following equation.

P(C|T, X) + P(~C|T, X) = 1

P(T|C) P(X|C) P(C) / P(T, X) + P(T|~C) P(X|~C) P(~C) / P(T, X) = 1

P(T|C) P(X|C) P(C) + P(T|~C) P(X|~C) P(~C) = P(T, X)

3.3 Bayesian Network

A Bayesian network is an efficient factorization of the joint probability

distributions over a set of variables. If we want to know everything in the domain, we

need to know the joint probability distribution over all those variables. If the domain is

complicated, with many different propositional variables, the solution is infeasible. For

example, if you have N binary variables, then there are 2^N possible assignments, and the

joint probability distribution requires a number for each one of those possible

assignments.

The intuition behind a Bayesian network is that there is almost always some separability

between the variables (i.e., some independence), so that we do not actually have to know

all of those 2^N numbers to know what is occurring in the world. Bayesian networks have

two components. The first component is called the "causal component." It describes the

structure of the domain in terms of the dependencies between variables, and the second

part is the actual numbers, the quantitative part.

There are three connection types in Bayesian networks. First is the forward serial

connection shown in Figure 3-2. Evidence is transmitted from A to C through B unless B

is instantiated (i.e., its truth value is known). The evidence propagates backward through

the serial links as long as the intermediate node is not instantiated. If the intermediate

node is instantiated, then evidence does not propagate.



[Figure: A → B → C]

Figure 3-2. Forward serial connection Bayesian network example

[Figure: A ← B → C]

Figure 3-3. Diverging connection Bayesian network example

[Figure: A → B ← C]

Figure 3-4. Converging connection Bayesian network example

The second connection type is the diverging connection shown in Figure 3-3. In a

diverging connection, there are arrows going from B to A and from B to C. If B is not

instantiated, the evidence of A propagates through to C. But if B is instantiated, the

propagation is blocked.

The tricky case is when we have a converging connection like Figure 3-4. A points

to B and C points to B. Let us first think about the case when neither B nor any of its

descendants is instantiated. In that case, evidence does not propagate from A to C. For

example, suppose B is "sore throat," A is "Bacterial infection," and C is "Viral

Infection." If we find that someone has a bacterial infection, it gives us information about

whether they have a sore throat, but it does not affect the probability that they have a viral

infection also. But when either node B is instantiated, or one of its descendents is, we

know something about whether B is true. And in that case, information does propagate

through from A to C.

If two variables are d-separated, then changing the uncertainty on one does not

change the uncertainty on the other. Two variables a and b are "d-separated" if and only

if for every path between them, there is an intermediate variable V such that either the

connection is serial or diverging and V is known, or the connection is converging and

neither V nor any descendant has evidence. For example, if the connection ABC is serial,

it is blocked when B is known and connected otherwise. When it is connected,

information can flow from A to C or from C to A.

Bayesian networks are sometimes called belief networks or Bayesian belief

networks. A Bayes net consists of three components: a finite set of variables, each of

which has a finite domain; a set of directed arcs between the nodes, forming an acyclic

graph; and, for every node A with parents B1 through Bn, a conditional probability

distribution P(A|B1, ..., Bn). The crucial theorem about Bayesian networks is that

if A and B are d-separated given some evidence e, then A and B are conditionally

independent given e; that is, P(A|B, e) = P(A|e). We can exploit these conditional

independence relationships to make inference efficient.

The chain rule results from the conditional independence relationship of Bayesian

networks. Let us assume there are n Boolean variables: V1, ..., Vn. The joint probability

distribution is the product of all the individual probability distributions that are stored in

the nodes of the graph.

P(V1 = v1, V2 = v2, ..., Vn = vn) = ∏_i P(Vi = vi | parents(Vi))    (3-3)



[Figure: nodes A (with table P(A)) and B (with table P(B)) both pointing to C (with table P(C|A,B)), and C pointing to D (with table P(D|C)).]

Figure 3-5. Example of chain rule

If we compute the probability that A, B, C, and D are all true, we can use

conditioning to write that.

P(ABCD) = P(D|ABC) P(ABC)

We can simplify P(D|ABC) to P(D|C), because given C, D is d-separated from A

and B. And we have P(D|C) stored directly in a local probability table, so we are done

with this term. Now we can use conditioning to write P(ABC) as P(C|AB) times P(AB).

These can be changed by d-separation.

P(ABC) = P(C|AB) P(AB)
       = P(C|AB) P(A) P(B)

For each variable, we just have to condition on its parents. Then, we multiply the

results together to obtain the joint probability distribution. This means that if you have

any independence (if you have anything other than all the arrows in your graph in some

sense), then you have to do less work to compute the joint distribution.
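A minimal numeric sketch of the chain rule for the network of Figure 3-5 (Python; the CPT entries are invented for illustration):

    p_a = 0.3            # hypothetical P(A)
    p_b = 0.6            # hypothetical P(B)
    p_c_given_ab = 0.9   # hypothetical P(C | A, B)
    p_d_given_c = 0.5    # hypothetical P(D | C)

    # Chain rule: P(A, B, C, D) = P(A) P(B) P(C|A,B) P(D|C)
    print(p_a * p_b * p_c_given_ab * p_d_given_c)  # 0.081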

3.4 Noisy-OR: Bayesian Network Inference

Imagine that there are three possible causes for having a fever: flu, cold, and

malaria. The network of Figure 3-6 encodes the fact that flu, cold, and malaria are

mutually independent of one another.


[Figure: Flu, Cold, and Malaria nodes each pointing to a single Fever node.]

Figure 3-6. Example of Noisy-OR

In general, the conditional probability table for fever will have to specify the

probability of fever for all possible combinations of values of flu, cold, and malaria. This

is a large table, and it is hard to assess. Physicians, for example, probably do not think

very well about combinations of diseases. It is more natural to ask them individual

conditional probabilities: what is the probability that someone has a fever if they have the

flu? We are essentially ignoring the influence of cold and malaria while we think about

the flu. The same goes for the other conditional probabilities. We can ask about

P(fever|cold) and P(fever|malaria) separately. We are assuming that the causes act

independently, which reduces the set of numbers that we need to acquire. If the patient

has flu, and the connection is on, then he will certainly have fever. Thus it is sufficient for

one connection to be made from a positive variable into fever from any of its causes. If

none of the causes are true, then the probability of fever is assumed to be zero (though it

is always possible to add an extra cause that is always true, but which has a weak

connection, to model the possibility of getting a fever "for no reason").

Here is the general formula for a noisy-OR. Assume we know P(effect|cause) for

each possible cause. And, we are given a set, C_T, of causes that are true for a particular

case. Then to compute the probability of E given C, we compute the probability of not E

given C.

P(E|C) = 1 − P(~E|C)    (3-4)

That is equal to the probability of not E just given the causes that are true in this

case, C_T. And because of the assumption that the causes operate independently (that is,

whether one is in effect is independent of whether another is in effect), we can take the

product over the causes of the probability of the effect being absent given the cause.



[Figure: cause nodes C1, C2, C3 each pointing to a single Effect node.]

Figure 3-7. General architecture of noisy-OR model

Finally, we can easily convert the probability of not E given C into one minus the

probability of E given C.

P(E|C) = 1 − P(~E|C)
       = 1 − ∏_{Ci ∈ CT} P(~E|Ci)    (3-5)
       = 1 − ∏_{Ci ∈ CT} (1 − P(E|Ci))
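A minimal sketch of equation 3-5 (Python; the per-cause probabilities are invented for illustration):

    from math import prod

    def noisy_or(p_effect_given_cause, true_causes):
        """P(E | the causes in true_causes are present), per equation 3-5."""
        return 1 - prod(1 - p_effect_given_cause[c] for c in true_causes)

    p = {"flu": 0.9, "cold": 0.4, "malaria": 0.95}   # hypothetical P(fever | cause)
    print(noisy_or(p, {"flu", "cold"}))              # 1 - 0.1*0.6 = 0.94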

3.5 QMR-DT Model

The QMR-DT model is a two-level or bi-partite Bayesian network intended for use

as a diagnostic aid in the domain of internal medicine. We provide a brief overview of the

QMR-DT model here; for further details see Shwe and Cooper (1991).
The QMR-DT model is a bipartite graphical model in which the upper layer of

nodes represents diseases and the lower layer of nodes represents symptoms. There are

approximately 600 disease nodes and 4000 symptom nodes in the database proposed by

Shwe and Cooper (1991).

The evidence is a set of observed symptoms, which are referred to as "findings." We

use the symbol f to represent the vector of findings. The symbol d denotes the vector of

diseases. All nodes are binary; thus the components f_i and d_j are binary random variables.

The diseases and findings occupy the nodes on the two levels of the network,

respectively, and the conditional probabilities specifying the dependencies between the

levels are assumed to be noisy-OR gates (Pearl 1988). There are a number of simplifying

assumptions in this model. In the absence of findings, the diseases appear independent

of each other with their respective prior probabilities (i.e., marginal independence),

although some diseases probably do depend on other diseases. Second, the findings are

conditionally independent given the diseases.

The probability model implied by the QMR-DT belief network can be written by

the joint probability of diseases and findings as

P(f, d) = P(f|d) P(d) = [∏_i P(f_i|d)] [∏_j P(d_j)]    (3-6)


where d and f are binary (1/0) vectors referring to the presence/absence states of the

diseases and the positive/negative states or outcomes of the findings, respectively. The

prior probabilities of the diseases, P(d_j), were obtained by Shwe et al. from archival data.

The conditional probabilities, P(f_i|d), for the findings given the states of the diseases,

were obtained from expert assessments and are assumed to be noisy-OR models:

P(f_i = 0 | d) = P(f_i = 0 | L) ∏_{j ∈ pa_i} P(f_i = 0 | d_j)    (3-7)

             = (1 − q_i0) ∏_{j ∈ pa_i} (1 − q_ij)^{d_j}    (3-8)


where pa_i (the parents of i) is the set of diseases pertaining to finding f_i.

q_ij = P(f_i = 1 | only d_j = 1) is the probability that disease j, if present, could alone cause

finding f_i to have a positive outcome, and q_i0 = P(f_i = 1 | L) is the "leak" probability,

i.e., the probability that the finding is caused by means other than the diseases included in

the belief network model. The effect of each additional disease, if present, is to contribute

an additional factor of (1 − q_ij) to the probability that the ith finding is absent.
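A minimal sketch of equation 3-8 for a single finding (Python; the q values and disease states are invented for illustration):

    from math import prod

    def p_finding_absent(q_leak, q, d):
        """P(f_i = 0 | d) per equation 3-8: leak term times per-disease factors."""
        return (1 - q_leak) * prod((1 - q[j]) ** d[j] for j in q)

    q = {"tb": 0.8, "flu": 0.3}   # hypothetical q_ij values for one finding
    d = {"tb": 1, "flu": 0}       # only tuberculosis is present
    print(1 - p_finding_absent(0.05, q, d))   # P(finding present) = 0.81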

3.6 Bayesian Classifiers

In this section, we introduce some of the classifiers of the form of Bayesian

network that can be used in the modeling of medical diagnosis. We can define the

classification problem as a function assigning labels to observations (Miquelez et al. 2004,

p. 340). If there is a vector x = (x_1, ..., x_n) ∈ R^n and a class variable C, we can regard

the classifier as a function γ: (x_1, ..., x_n) → {1, 2, ..., |C|} that assigns labels to

observations. This can be rewritten to obtain the highest posterior probability, i.e.,

γ(x) = arg max_c P(c | x_1, ..., x_n).

We can use the Bayesian classifier in medical diagnostics to find the probable

disease from the given symptoms.

We will use the notation O, meaning outcome, for the class variable C, and F,

meaning finding, for the observed variables in the explanations in the following chapters. We use

capital letters for variable names and small letters for the values.

3.6.1 Naive Bayes

The concept that combines the Bayes theorem and the conditional independence

hypothesis is proposed by several names: idiot Bayes (Ohmann et al. 1988), naive Bayes

(Kononenko, 1990), simple Bayes (Gammerman and Thatcher 1991), or independent

Bayes (Todd and Stamper 1994). The naive Bayes (NB) approach (Minsky, 1961) is the

simplest form of classifier based on Bayesian networks. The outcome variable O is

defined as the common parent of the findings, F = {F_1, ..., F_n}, and each of the findings

F_i is a child of the outcome variable O. The shape of the network is always the same: all

variables F_1, ..., F_n are considered to be conditionally independent given the value of the

outcome variable O, which is the main assumption of NB.

This is a conditional probability model. We can calculate the posterior probability

using Bayes rule and conditional independence.

P(O | F_1, ..., F_n) = P(O) P(F_1, ..., F_n | O) / P(F_1, ..., F_n) ∝ P(O) ∏_i P(F_i | O)
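A minimal sketch of this posterior computation (Python; the priors and conditional probabilities are invented for illustration):

    from math import prod

    def nb_posterior(priors, cond, findings):
        """Score each outcome by P(o) * prod_i P(f_i | o), then normalize."""
        scores = {o: priors[o] * prod(cond[o][f] for f in findings) for o in priors}
        total = sum(scores.values())
        return {o: s / total for o, s in scores.items()}

    priors = {"disease": 0.1, "healthy": 0.9}               # hypothetical P(O)
    cond = {"disease": {"fever": 0.8, "cough": 0.6},        # hypothetical P(F_i | O)
            "healthy": {"fever": 0.1, "cough": 0.2}}
    print(nb_posterior(priors, cond, ["fever", "cough"]))   # disease ≈ 0.73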

The main advantage of this approach is that the structure is always fixed and simple

to calculate because the order of dependence to be found is fixed and reduces to two

variables. Storing only the conditional probability distributions P(F_i | O) results in a

considerable reduction in the number of parameters necessary. The naive Bayes model

only requires 2n + 1 parameters, where n is the number of findings F_i, whereas the joint

probability requires 2^n parameters. But the model allows no relationships between findings, which is

not realistic in the real world. There is extensive literature showing even these kinds of

simple computational models can perform surprisingly well (Domingos and Pazzani

1997) and are able to obtain results comparable to other more complex classifiers.

3.6.2 Selective Naive Bayes

The selective naive Bayes is a subtly different model compared to the naive Bayes,

adding the ability to select among the findings. In the selective naive model, not all variables have

to be present in the final model (Kohavi and John 1997; Langley and Sage 1994).

There is a restriction that all variables must appear in the naive Bayes model for

some types of classification problems, but some variables could be irrelevant or

redundant for classification purposes. It is known (Liu and Motoda 1998; Inza et al.

2000) that the naive Bayes paradigm degrades in some cases, so the motivation of

removing variables is modeled in the selective naive Bayes (Miquelez et al. 2004, p. 340).

3.6.3 Seminaive Bayes

The intuition in the seminaive Bayes model is that we can combine variables (i.e.,

findings) together (Kononenko, 1991). It allows groups of variables to be considered as a

single node in the Bayesian network, aiming to avoid the strict premises of the naive

Bayes paradigm.

3.6.4 Tree Augmented Naive Bayes

In the tree augmented naive Bayes (Friedman et al. 1997), the dependencies

between variables other than C are taken into account. The model represents the

relationships between the variables, X_1, ..., X_n, conditional on the class variable C by

using a tree structure. The tree augmented naive Bayes structure is built using a two-

phase procedure. First, the dependencies between the different variables X_1, ..., X_n are

learned. This algorithm uses a score based on information theory, and the weight of a

branch (X_i, X_j) on a given Bayesian network S is defined by the mutual information

measure conditional on the class variable as

I(X_i, X_j | C) = Σ_c P(c) I(X_i, X_j | C = c)

             = Σ_c Σ_{x_i} Σ_{x_j} P(x_i, x_j, c) log [ P(x_i, x_j | c) / ( P(x_i | c) P(x_j | c) ) ]

With these conditional mutual information values the algorithm builds a tree

structure. In the second phase, the structure is augmented into the naive Bayes paradigm.
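A minimal sketch of this conditional mutual information measure (Python; it assumes the joint distribution is given as a dict over (x_i, x_j, c) triples, with values supplied by the caller):

    from collections import defaultdict
    from math import log

    def conditional_mi(joint):
        """I(Xi; Xj | C) from a dict mapping (xi, xj, c) -> P(xi, xj, c)."""
        p_ic, p_jc, p_c = defaultdict(float), defaultdict(float), defaultdict(float)
        for (xi, xj, c), p in joint.items():    # marginalize the joint distribution
            p_ic[(xi, c)] += p
            p_jc[(xj, c)] += p
            p_c[c] += p
        # P(xi,xj|c) / (P(xi|c) P(xj|c)) simplifies to p*P(c) / (P(xi,c) P(xj,c)).
        return sum(p * log(p * p_c[c] / (p_ic[(xi, c)] * p_jc[(xj, c)]))
                   for (xi, xj, c), p in joint.items() if p > 0)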

3.6.5 Finite Mixture (FM) model

The finite mixture (FM) model tries to relax the conditional independence

assumption in the naive Bayes model (Cheeseman and Stutz 1996). In an FM model, all

the dependencies between observed variables, both the findings and the outcome variable,

are assumed to be modeled by a single discrete latent (i.e., unobserved) variable (Monti

and Cooper 1998, p. 593). In an FM model the outcome variable is itself a child node, and

the common parent is a latent variable.

3.7 Summary

We described two knowledge representation models: semantic networks and

Bayesian networks. There are attempts to model medical diagnosis using probabilistic

Bayesian models. Shwe's QMR-DT model is a two-level noisy-OR model using disease

and symptoms nodes, where the nodes in the same layer are independent. The QMR-DT

model uses several assumptions to reduce the complexity of the joint probability

distribution calculation, but it shows exponential time complexity when implemented as

an algorithm. There were several attempts to use Bayesian classifiers in a medical

diagnosis model: naive Bayes, selective naive Bayes, seminaive Bayes, tree augmented

naive Bayes, finite mixture model, and finite mixture augmented naive Bayes. Unlike the

other models, which model dependencies among the findings, naive Bayes assumes conditional


independence among the findings. But even with the simplicity of its modeling, naive Bayes

shows good performance when compared to other complex models.

The next chapter explains the overall architecture of the Knowledge-Based Information

Retrieval (KBIR) model that uses semantic networks and naive Bayes as a knowledge

model.

CHAPTER 4
KNOWLEDGE BASED INFORMATION RETRIEVAL MODEL ARCHITECTURE

This research developed a knowledge-based information retrieval model for a closed

domain. Figure 4-1 shows the architecture of the model.

[Figure: data-flow diagram. A Query produces a Query Vector (1a) and the Documents produce a Document Vector (2a). The knowledge base management engine builds the Knowledge Base from the existing documents (3a, 3b) and from each New Document (5a, 5b). The Knowledge Conversion Engine applies the Knowledge Base to the Document Vector (2b, 3d) to produce the Conceptual Document Vector (4a), and the VSM IR engine compares it with the Query Vector (1b, 1c) to produce the Ranked Result (1d).]

Figure 4-1. Architecture of the knowledge-based information retrieval model

The overall operation of model is as follows. A classical vector space model

(VSM) information retrieval model using term frequency and inverse term frequency

creates a query vector (1a) and a document vector (2a). The knowledge base management

engine (KME) creates (3b) knowledge from the set of existing documents (3a) before the

system operation starts. The KME processes and adds knowledge from any new

documents (5b) added to the document space. The Knowledge Conversion Engine (KCE)

applies the knowledge (semantics) of the Knowledge Base to the Document Vector (2b,

3d) to create the Conceptual Document Vector (4a). The conventional VSM IR engine

calculates the relevance between the query vector and the conceptual document vector

(1b, 1c), resulting in a ranked document list (1d).

To illustrate proof of concept, we implement this model in the domain of pathology.

Figure 4-2 shows a detailed architecture of the resulting model. The edges of this diagram

represent procedures or actions taken in processing the nodes, which represent data or

subsystems. Among the procedures shown by the edges, the bold edge processes (1a, 1b,

1c) are on-line processes, while edges shown with normal lines are off-line processes

completed before the start of any user's query processing. For this domain the knowledge

base is named the SNOMED Semantic Network Knowledge Base (SNN-KB). The SNN-

KB is part of the KME developed from the off-line processing (4a) of SNOMED.

The documents used in the pathology domain are pathology reports called

Anatomic Pathology (AP). Because we preprocessed AP raw text data into a database,

the actual data from the documents used in this system are contained in the Anatomic

Pathology Database (APDB). The Document Vector is produced (2b) from the APDB,

and the KME creates (2a) the dynamic parts of the SNN-KB. When a new document is

added (3a), the KME modifies (3b) the Document Vector and the SNN-KB. The

Knowledge Conversion Engine (KCE) initially makes the Conceptual Document Vector

(5c) from the Document Vector and the KME's SNN-KB (5a, 5b). Periodically the KCE

updates the Conceptual Document Vector (CDV) to reduce the computational needs

rather than updating the CDV every time a new document is added.

[Figure: detailed data-flow diagram for the pathology domain. The Query produces a Query Vector (1a); the APDB produces the Document Vector (2b); the Knowledge Base Management Engine builds the SNN-KB from SNOMED (4a) and the APDB (2a), updating both when a New Document arrives (3a, 3b); the Knowledge Conversion Engine combines the Document Vector and the SNN-KB (5a, 5b) into the Conceptual Document Vector (5c); and the VSM IR engine produces the Ranked Result (1b, 1c, 1d).]

Figure 4-2. Architecture of the knowledge-based information retrieval model detailed in
the example domain

Before we describe the Knowledge Base Management Engine, we describe

SNOMED and the characteristics and pre-processing of the example data: the Anatomic

Pathology Database (APDB).

4.1 SNOMED

Surgical Pathology, cytology, and autopsy reports are highly structured documents

describing specimens, their diagnoses, and retrieval and charge specification codes. The

Systematized Nomenclature of Medicine (SNOMED) developed by the College of

American Pathologists is used for a retrieval code. This was developed in collaboration

with multiple professional societies, scientists, physicians, and computer consultants

[Systematized, 1979]. SNOMED II is a hierarchically organized and systematized

multiaxial nomenclature of medical and scientific terms. There are six main axes based

on the nature of man. These begin with a hierarchical listing of anatomical systems,

known as the Topography (or T) axis. Any change in form of topography structures

throughout life is characterized in the Morphology (or M) axis. Causes or etiologies for

those changes are listed in the Etiology (or E) axis. All human functions, normal and

abnormal, are listed in the Function (or F) axis. Combinations of Topography,

Morphology, Etiology, and Function may constitute a disease entity or syndrome and are

classified in the Disease (or D) axis. Using the T, M, E, F, and D axes it is possible to

code nearly all anatomic and physiologic features of a disease process as shown by the

example in Figure 4-3.

T + M + E + F = D
Lung + Granuloma + M.tuberculosis + fever = Tuberculosis

Figure 4-3. The "Equation" of SNOMED disease axes


There is another field that is not part of the disease equation: a Procedure field,

classified in the Procedure (or P) axis, which allows identification of services or actions

performed on behalf of the patient with the problem.

Pathology reports typically consist of useful, apt, and concrete terms in sentence or

template format. The diagnostic terminology in reports and SNOMED involve standard

terms and acceptable synonyms, both have the same SNOMED code number (e.g.,

Pneumonia and pneumonitis are coded T28000 M40000 or lung + inflammation).

Pathology reports usually contain a specific field for SNOMED codes. Certain anatomic

pathology computer systems include SNOMED files that allow code selection, but

automated encoding programs are uncommon. Pre-coded synoptic templates of

diagnostic terms allow consistency for diagnostic encoding, but many diagnostic

statements contain descriptive language, semantic forms, and linguistic nuances that

make automated coding difficult. There is a continual need for error checking.

4.2 Anatomic Pathology Database (APDB) Design and Development

Two important characteristics of the APDB patient records are their fixed data and

closed domain. The system's target data are patient records from 1980 to the present,

which we consider as fixed or static, meaning that any dynamic features of the system are

minimized. The nomenclature used in a patient report is restricted to the domain of

anatomic pathology and related areas of medicine, making it a relatively closed domain.

These features provide a good environment and structure for constructing a knowledge

base.

Among the several forms of knowledge representation commonly used, the

semantic network is widely used for representing simple hierarchical structures. Because

SNOMED has a hierarchical architecture, we adopted the semantic network for the

knowledge representation method.

4.2.1 Metadata Set Definition

Appendix A shows the metadata set definition used to parse the patient surgical

pathology records. There are 25 terms that must be located and separated in the current

patient record. These terms serve as attributes in the database table. Because some term

names have changed through the years, several synonyms exist for some terms. For

example, "SURGICAL PATH NO," "ACC#," and "CYTOLOGY NO" have the same

meaning: the sequential number of the patient record in the set.

The parser, a batch program, processes the patient record and creates an output file

containing separate patient record fields. The Database (DB) loader reads the output

generated by the parser and then stores the results in the DB. The parser also generates an

index file that has proximity information among the words inside the gross description

and diagnosis. This can be used in multiple keyword information searches. The proximity

information is needed to rank the relevant results.

4.2.2 Information Processing: Retrieval and Extraction

There are several distinct advantages in processing the pathology patient data. First,

the patient record data from 1982 to the present are unique to the University of Florida.

This reflects a unique character, both regionally and periodically. Thus, when the parsing

is finished, the analysis of the frequency of words and multiple word terms has

significant meaning. Second, because the patient reports are expressed in standard

medical language (which varies slightly from physician to physician), the terms used are

sometimes not an exact match to the SNOMED terms. This makes it useful to analyze the

patient reports based on the SNOMED terms. Patient reports also have a <SNOMED
Codes> field that shows matching SNOMED codes with the <Diagnosis>. The analysis

of the SNOMED code frequency throughout the patient records can give a valuable

research sense to the pathologist. These types of analysis can be done statically and can

be reported all at once.

While this static analysis is extremely useful, most information processing should

be done dynamically. We cannot imagine or anticipate all requests that might be made of

this knowledge base. So for information retrieval purposes, the terms in the documents

were analyzed. This provided the relation between the documents and the terms in the

form of a proximity value.

4.3 Summary

We showed the architecture of the developed knowledge based information

retrieval model. The model shows well-separated sections of on-line and off-line

calculation to provide efficiency in the calculations during the document retrieval process.

The knowledge reduction technology enables the off-line adaptation of knowledge, which

is a distinct modeling concept compared to other knowledge-base models incorporating

knowledge processing in their retrieval process. We also described the experimental

domain: pathology and SNOMED.

In the next chapter, we describe the details of this model. First, we describe the

Knowledge Base Management Engine (KME) and a knowledge base structure that

contains the domain specific knowledge in Chapter 5. Second, we provide details on the

Knowledge Conversion Engine (KCE) in Chapter 6. There, we describe the query vector,

the document vector, and the conceptual document vector. The VSM IR engine uses the

same query-document relevancy calculation method as the conventional vector

model.

CHAPTER 5
KNOWLEDGE BASE MANAGEMENT ENGINE

The knowledge base for this KBIR system is the Systematized Nomenclature of

Medicine (SNOMED). In this chapter we discuss the SNOMED-based knowledge model,

which consists of pre-coordinated knowledge and post-coordinated knowledge. The pre-

coordinated knowledge is knowledge described in SNOMED that is coded by a

pathologist. We can say that this knowledge is the expert knowledge that the pathologist

used in writing and understanding a patient's report. The post-coordinated knowledge is a

special form of knowledge that can be obtained from a patient's report. This is

augmentable knowledge that can be found from the introduction of new data. The

knowledge base uses the constructed model in the information retrieval process.

5.1 Semantic Network Knowledge Base Model Representing SNOMED

SNOMED is a detailed and specific coded vocabulary of names and descriptions

used in healthcare. It is explicitly designed for use in computerized patient records. We

can classify the term-to-term relationships, which are called the "pre-coordinated"

relationships in SNOMED, as one of three types. See Figure 5-1.


[Figure: legend of the three SNOMED term relations: hierarchical topology (has-a), synonymy (is-a), and multi-axial relation.]

Figure 5-1. The three types of SNOMED term relation

The first type is a hierarchical topology. The SNOMED terms are all arranged in a

hierarchy, represented by an alphanumeric code where each digit represents a specific

location in the hierarchy. Figure 5-2 illustrates the hierarchical structure of this

knowledge modeled as a semantic network. Arcs expressing the "part-of" or "has-a"

relation connect the nodes of this network. Moving from a lower level concept to a higher

level is generalization, while moving in the opposite direction is specialization.

[Figure: hierarchy with T28000 (Lung) at the top, T28100 (Right Lung) and T28500 (Left Lung) below it, and T28110, T28120, ... below those; moving up the hierarchy is generalization, moving down is specialization.]

Figure 5-2. SNOMED hierarchical term relationship

SNOMED has controlled vocabulary characteristics. A controlled vocabulary

allows individuals to record data in a patient's record using a variety of synonyms, where

each references a primary concept. For example, in SNOMED, the following terms are

classified as symptoms of increased body temperature: FEVER, PYREXIA,

HYPERTHERMIA, and FEBRILE. Each carries the same term code. Figure 5-3 illustrates

another example using the semantic network form. We call the relationship of synonyms

an "is-a" relationship. The synonym relation is explicit between each pair of terms; there is no

propagation among the nodes.

[Figure: synonym terms linked by "is-a" relations, all sharing code D0110, bacterial sepsis.]

Figure 5-3. SNOMED synonyms relationship

[Figure: multi-axial relation linking E6921 (Fava bean) and D4094 (Favism).]

Figure 5-4. SNOMED Multiaxial relationship

The third relationship of SNOMED terms is a multi-axial relation shown in Figure

5-4, which refers to the ability of the ordered set of names to express the meaning of a

concept across several axes. We can find examples of this relationship over all axes with

it most apparent in the disease axis. The SNOMED D code representing "Tuberculosis"

has an information link to the T code representing "Lung." This relationship is pre-coded,

mirroring the knowledge encoded at the time of SNOMED's standardization.

5.2 Classification of the Post-Coordinated Knowledge

The domain-specific knowledge of our model handles only multi-axial

relationships among the three types of SNOMED relations. This relationship is most

apparent in the disease axis with a series of codes from other axes of SNOMED

comprising the essential characteristics of a disease.

As detailed in Section 4.1, SNOMED consists of six categories: Topography,

Morphology, Etiology, Function, Disease, and Procedures. A patient report has <SNOMED
codes> terms showing matching SNOMED categories and numbers. It is possible to code

most of the anatomic and physiologic elements of a disease process, both normal and

abnormal, with the combination of the five axes. These elements are often used to

summarize a codable class of disease or a recognized syndrome, basically what is called

the SNOMED equation shown in Figure 4-3.

Some of the relations are straightforward, but some cases have unique relationships

based on the patient's report. It is possible to develop a unique knowledge base using

these relationships. We can find statistics within the pathology document space that form

the basis of the post-coordinated knowledge, then we classify the extracted post-

coordinated knowledge.

5.2.1 Statistics of Pathology Patient Report Documents Space

We examined Anatomic Pathology (AP) data sets from 1983 to 1994. There are a

total of 290,346 data sets. Table 5-1 shows the number of data sets for each year.

From the data set, we extracted the SNOMED codes from each document. The

SNOMED codes represent the semantics of each document. Table 5-2 identifies the

number of unique SNOMED axes. Appendix B is a partial list of unique SNOMED codes

found in the patient reports.

Table 5-1. Number of AP data each year from '83 to '94
Year Number of sets
1983 17,351
1984 23,186
1985 22,781
1986 22,928
1987 22,965
1988 26,663
1989 27,486
1990 27,814
1991 25,497
1992 23,755
1993 24,303
1994 25,635
Total 290,346


Table 5-2. Number of unique SNOMED axes equations
Axis    Number of unique occurrences    Total occurrences
T 3,759 702,942
M 4,460 594,870
E 315 137,057
F 413 44,278
D 771 11,001
P 637 348,716
Total 10,355 1,838,864


Table 5-3 shows the number of distinct relations between axes. From the statistical data,

we can calculate the base prior probability of the naive Bayes based post-coordinated

knowledge structure that is explained in Section 5.3.

5.2.2 Classification of Post-Coordinated Knowledge

From the SNOMED codes of each document, we can extract post-coordinated

knowledge. Because of the uncertainty of the world, the pathologist does not know or

describe the SNOMED equation exactly. This means there will be a partial description of

knowledge. We only count the description of a SNOMED code as post-coordinated

knowledge if it contains the "D" axis. If the pathologist described a SNOMED code

including "D", there is acceptable certainty of that a SNOMED equation exists. Figure 5-

5 shows the four kinds of SNOMED equations found in documents space.

Table 5-3. Relation statistics among axes
Axis    Number of unique relations    Related axis    Number of two-axis relations
T 48170 M 34354
E 979
F 972
D 1515
P 10299
M 75190 T 34354
E 1160
F 1268
D 1684
P 12480
E 2999 T 979
M 1160
F 57
D 75
P 527
F 3229 T 972
M 1268
E 57
D 190
P 486
D 4706 T 1515
M 1684
E 75
F 190
P 1067


Table 5-4 shows the amount of post-coordinated knowledge found in the document

space. We use this knowledge to induce possible diseases from incomplete SNOMED

equations (i.e., equations lacking a disease axis).
[Figure: (a) the two-axis relation D-T; (b) three-axis relations such as D-T-E and D-T-F; (c) four-axis relations such as D-T-E-F, D-T-M-E, and D-T-M-F; (d) the five-axis relation D-T-M-E-F.]

Figure 5-5. Classification of post-coordinated knowledge
Table 5-4. Statistics on post-coordinated knowledge
Post-coordinated Number of unique relations
knowledge relations
D-T 568
D-T-E 26
D-T-F 38
D-T-M 7,425
D-T-E-F 3
D-T-M-E 305
D-T-M-F 534
D-T-M-E-F 68

5.3 Statistical Model of the Post-Coordinated Knowledge

Figure 5-6 shows an example of a four-axis-relation post-coordinated knowledge.

We define link frequency (lf) as the total number of links in the code-to-code relation

context after parsing the current patient's report. The link frequency shows the closeness

of the relationship; the larger the value, the closer the relationship.

Type-D relations
Disease axis term
D: Tuberculosis

/-78

..... '._..1_40 /_.f .54 i


T M E F
Lung Granuloma M.tuberculos Fever
S/=376 /=1480 f=378 8
Other axis terms i
----. -.- .- .- .- .- .------ - - -- - - --------------..-. .-------..- ----. -. -. -. -. -. -.- .- .- .- .- .- .-.--- -- -- th e r .m is -te rm ~s
Type-M relations
Figure 5-6. An example of a four-axis-relation post-coordinated knowledge

We can obtain the post-coordinate knowledge by searching the complete SNOMED

equation from documents described in the previous section. Then, we can obtain the link

frequency of each relation between two axes statistically following the induction of

complete knowledge from the incomplete SNOMED equation. We use the link frequency,

discussed in Chapter 6, for conversion of statistical model of post-coordinated knowledge.

5.4 Naive Bayes Model of Post-Coordinated Knowledge

It is possible to create or learn a Bayesian network from the data. This is an

instance of the problem, known in the statistics literature as density estimation. We can

estimate the probability density (that is, ajoint probability distribution) from the data.

When we learn a Bayes network from the data, there are four different cases: structure









known or unknown, and all variables observable or some unobservable. For our case, the

structure is known and some variables are unobservable.

To model "post-coordinatedknowledge," we have several assumptions:

* We consider only the knowledge consisting of a SNOMED equation. Figure 5-7
shows the basic architecture of a SNOMED equation expressed using a Bayesian
network.

* We assume we have complete knowledge before processing a patient's report. The
complete knowledge can be obtained from searching complete SNOMED equations
from the documents space. We call this complete knowledge as a "post-
coordinated knowledge. "

* The "post-coordinated knowledge" consists of combination of the five axes with
the disease axis being mandatory.

* Complete knowledge is unique.

* Each disease is independent.

* The four axes (T, M, E, and F) are independent of each other.

* T, M, E, and F are conditionally dependent upon the instantiation of D.











Figure 5-7. Structure of the post-coordinated knowledge in a Bayesian network

In our case, the structure of a Bayes network is fixed. It has one of the forms shown

in Figure 5-5. We can consider the knowledge complete only if there is disease axis in the

SNOMED equation (i.e., in the document). We use the following algorithm to extract the

knowledge.

1. Look through the documents to find a SNOMED equation in the document having
the complete post-coordinated knowledge form shown in Figure 5-5.









2. Extract only the complete knowledge form from the documents retrieved.

3. Use an expert to verify that the extracted knowledge is correct. Generally, we can
consider the equation to be complete if it contains a "D" axis.

4. Add the extracted and verified knowledge into the system's knowledge the "Post-
coordinated knowledge base" (PCKB).

It is possible that individual document can contain incomplete knowledge due to a

lack of expert knowledge or an error. This means some variables of the "Post-

coordinated knowledge base" (PCKB) are not observable in some documents. In that

case, we must induce the value of the unobserved variables in the complete PCKB. To do

this, we need to estimate the probability values of the PCKB structure's variables.

It is easier to start by estimating P(D). This is computed by counting how many

times D is true (=found positive) in data set (documents) and dividing by n, the total

numbers of documents. To obtain an estimate of the probability that Tis true given that D

is true, we just count the number of cases in which T and D are both true, and divide by

the number of cases in which D is true. The probability of Tgiven not D is similar shown

below.

P(D) (D = true)
n

P(- D) 1- P(D)

( # I (T = true D = true)
# (D = true)

P(T D) #(T = true nD = false)
#(D = false)

There is one problem with this approach. There will be situations where the number

of"D is true," "D is false," or "Tand D are true" cases is 0. In those situations we

calculate a value of 0 for that probability. Because we start from the "base knowledge"









structure, the later case should not occur, but it is possible for the number of"D is true"

or "D is false" cases to be 0. To guard against this, we can apply a "Bayesian correction"

to our estimates. This means, essentially, initializing our counts at 1 rather than at 0. So,

we add a 1 to the count in the numerator, and a value m to the denominator, where m is

the number of possible different values the variable whose probability we are estimating

can have. In our case, all variables are binary, so we add a 2 in the denominator. The new

formula looks like the following.

P(T # (T = true D = true) +1
#(D = true)+ 2

P(T #(T = true cD = false) +
#(D = false)+ 2

Processing documents to obtain PCKB results in m components of PCKB. Each

PCKB has the probability estimations shown in Figure 5-8.

5.5 Summary

We described the Knowledge-base Management Engine (KME) modeling

SNOMED pre-coordinated and post-coordinated knowledge. The pre-coordinated

knowledge is modeled using a semantic network notation. It has synonym, multi-axial,

and hierarchical relationships. The post-coordinated knowledge can be modeled either

statistically or probabilistically. We created the statistical model using the concept of link

frequency that can be obtained from the processing of the documents space. We used the

naive Bayes network as a probabilistic model of the post-coordinated knowledge. The

naive Bayes network model has a simple structure by its independence assumption, while

providing simplistic but acceptable results with its simple structure for calculating the

joint probability distribution, that is post priority of disease.








We describe the Knowledge Conversion Engine (KCE) in the next chapter. The

KCE handles the conversion of knowledge to quantitative values. We call the conversion

process knowledge reduction.


P(tlIdl) P(mlldl) P(elldl) P(flldl)
P(tlI-dl) P(mll-dl) P(ell-dl) P(fll-dl)
Figure 5-8. PCKB component structure and probability estimation.














CHAPTER 6
KNOWLEDGE CONVERSION ENGINE (KCE)

The Knowledge Conversion Engine (KCE) converts the Support Vector Machine

(SVM) document vector to a conceptual document vector reflecting the knowledge of the

SNOMED Semantic Network Knowledge Base (SNN-KB). We start our discussion of

the process with a description of the SVM document vector.

6.1 Support Vector Machine Document Vector

The best-known model in information retrieval is the Vector Space Model (VSM)

(Salton et al. 1989). In the VSM, documents and queries reside in vector space. In this

space, each document can be represented as a linear combination of term vectors. The

definition of the vector space model follows:

Definition 6.1: A document vector for a document di is

di =(wl,,w2,,... w,J )T where
wl > 0 is a weight associated ii th the pair (k, d ) where
k, is an index term, di is a document, and
t is the number of index terms in the whole system.

Definition 6.2: The set of all index terms K is
K = {k,,...,k,} where
t is the number of index terms in the whole system.

Normally the index terms are words contained in the document. The set is usually

confined to only the significant words by eliminating common functional words called

stopwords. The VSM uses the term frequency and the inverse term frequency as a

weighting scheme associated with the document.

Definition 6.3: The weight w,,s > 0 is









w ,, = tf,, x idf where
tf, is the term frequency of term i in documents and

idf = log N (the inverse document frequency) where
n
N is the number of documents in the collection and
n, is the document frequency of term i

The document frequency is the number of documents in which the term occurs.

6.2 Conceptual Document Vector

The SVM document vector uses term frequency and inverse document frequency as

a conceptual imbuement to the information retrieval model. There has been an attempt to

use phrases as index terms instead of words (Mao and Chu, 2002), which gives a

conceptual similarity of phrasal words in the retrieval model. They reported a 16%

increase of retrieval accuracy compared to the stem-based model.

In the Vector Space Model, term vectors are pair-wise orthogonal meaning that

terms are assumed to be independent. There was an attempt to incorporate term

dependencies, which gives semantically rich retrieval results (Billhardt et al. 2004, p.

239). They used a term context vector to reflect the influence of terms in the conceptual

description of other terms. The definition of a term context vector follows:

Definition 6.4: The set of term context vectors Tis
c c21 ... cn

T = 2 C22 n2 where

Cln C2n Cnn j
n is number of terms and
c k represents the influence of term tk on term t,.

Definition 6.5: The term context vector t is the ith column of matrix Twhere

t = (c, cl2,2 ,Cl)T anywhere
n is number of terms and









ck represents the influence of term tk on term t,.

The Knowledge Conversion Engine (KCE) converts relationships within the SNN-

KB into a term context vector. In the following, we discuss how the elements of matrix T

can be obtained from domain-specific knowledge base representation.

6.3 KCE: Knowledge Reduction


Human friendly graph representation
Computationally complex




J Reduction of
Knowledge

(1= ( 11. 1 ll. 1

Computer friendly statistical model
Computationally efficient
Figure 6-1. Knowledge reductions

There are two types of knowledge to convert: pre-coordinated and post-

coordinated knowledge. We reduce the dimension of the knowledge of the pre-

coordinated knowledge to a conceptual document vector. The form of knowledge

expressed by a graph (in our case, a semantic network) is a human friendly form. But it is

computationally complex. We convert that knowledge into a computer friendly and

efficient statistical form. The concept of knowledge reduction is shown in Figure 6-1.









6.4 KCE: Conversion of Pre-Coordinated Knowledge

Three types of relationships exist within the SNN-KB model representing

SNOMED. In the first type, the hierarchical topology relationship, each node has

attributes denoting its characteristics on the hierarchical tree.



L(i)=O
i ) D(i)=6







So ?L(j)=2
^ D(j)=0
Figure 6-2. Attributes of the SNN-KB hierarchical topology relation

L(i) is the level of term i in a knowledge tree. D(i) is the number of descendents of

term node i in the tree. The term influence between i and j is inversely proportional to the

distance, which is the difference of the levels. Having many descendents means that a

node is a more general term than some node having a smaller number of descendents. So

term influence is inversely proportional to the number of descendents. Thus we can

calculate the SNN-KB hierarchical topology relationship between the two terms i and j

as:

Definition 6.6: c~ from the SNN-KB hierarchical topology is
1 1
c = C(Sht) x x log where
d(i, j) D(i) + D(j)
C(Sht) is the coefficientfor the SNOMED hierarchical topology
relation and
d(i, j)= L(i)- L(j) where
L(i) is level of node i and L() is level of node j,
D(i) is number of descendents of node i, and









D() is number of descendents of node j.
For the synonym relations:
Definition 6.7: c, from the SNN-KB synonym relation is
ci = C(Ss) where
C(Ss) is the coefficientfor the SNOMED synonym relationship.

For the multi-axial relations:

Definition 6.8: c, from the SNN-KB multi-axial relation is
c, = C(Sm) where
C(Sm) is the coefficientfor the SNOMED multi-axial relationship.

The value of C(Sht), C(Ss), and C(Sm) should be optimized by simulation.

6.5 KCE: Generating the Conceptual Document Vector

By converting the SNOMED knowledge and domain-specific knowledge to the

term-relation matrix T defined in Definition 6.4, we can transform each initial document

vector d j =(w ,w2,j... ,j ) into a conceptual document vector


cd = (c, ,c2,,., t,J ) using the equation in Definition 6.9 (Billhardt et al. 2004, p.

240).

Definition 6.9: cd, from d, (Definition 6.1) and t, (Definition 6.5) is
n t


cd,- = where

j=1
t, is the term context vector of term t and

is the length of vector t.

The division of the elements in the term context vectors by the length of the vector

is a normalization step.









6.6 KCE: Conversion of the Post-Coordinated Knowledge

Post-coordinated knowledge can be obtained from a user's document (i.e., a

patient's report) after processing all documents in the system. This knowledge cannot be

obtained from the pre-defined SNOMED knowledge base. This knowledge contains noise

because the coding ability including the correctness of the coding of the patient report

varies from pathologist to pathologist.

We can define two kinds of models: statistical and probabilistic.

6.6.1 Statistical Model of Post-Coordinated Knowledge

To compute the statistical model, we first introduce the link frequency (//) to

express the closeness of the relation between terms.

Definition 6.10: The linkfrequency If is
the number of linkages accumulated fom all system document domain-
specific knowledge.

The domain-specific knowledge in pathology consists of the multi-axial relations

that have more importance on the disease axis. In the knowledge of multi-axial disease

centered relationships, relations between axis terms can be divided into two types. We

call relations including a disease as a D-type (Disease related type) relation and the other

relations as an M-type (Multi-axial related type) relation. The reason for separating the

relations is that the disease axis related relations have more meaning than the other

relations. Figure 6.3 shows an example of this domain-specific knowledge model

showing newly defined attributes.

Figure 6.3 describes a relation between disease i and other axis terms: jl,j2,j3, and

j4. The relation between i andjl was found 230 times, which is the link frequency from

the start of the system until now (i.e., since the start of data in the database). Because the








value of link (i, jl) is greater than the other D-relation links, it is more important than the

other links.

Type-D relations
Disease axis term



=230/ / \ \ f78
1 140 -/54



i---- j3 j4
Sf=376 f= 1480 j3 1378 j4

Other axis terms


Type-M relations
Figure 6-3. Example of Domain-Specific Knowledge relations

For the Type-M relations, we can define the term to term relation factor, cy, as.

Definition 6.11: c, from the Domain-specific Type-M relation is


c = I + C(DM)x C where
nn(n 1)
2
C(DM) is the coefficient of the Domain-specific M-type relation,
If is the link fequency between i andj, and
lfc is the link fequency of other relations other than i andj.

Figure 6-4 shows the conversion concepts of type-M relations. The type-M relation

is a sum of the importance of link and averaged influences from other links.

If we look at type-D relations, one disease term has several relations with type-M

nodes. So, we have to consider the influence on one type-D relation to the other type-D









relations. For example, if we calculate the relation factor between node i and j 1, we must

consider the influences from other relations: (i, j2), (i, j3), and (i, j4) to the relation (i, j 1).



T M E F

fl-376 If 1480 If-- 378
Other axis terms

CT Type-M relations







E F

Figure 6-4. Conversion oftype-M relations

For the D-type relations:

Definition 6.12: c, from the Domain-specific Type-D relation, where node i is
disease term and node j is other term, is


c = C(DD)xlJf +C(DDn) c where
D N, -1
C(DD) is the coefficient of the Domain-Specific D-type relation,
If is the link frequency between i andj,
C(DDn) is the coefficient of the Domain-Specific Disease
Neighbor relation,
If is the link frequency other than i andj, and
N, is the number of axis other than diseases in the knowledge-base.

The statistical model of the post-coordinated knowledge can be applied to the

conceptual matrix. This means the knowledge is applied to the document vector

"generally" regardless of each document's situation.









6.6.2 Probabilistic Model of Post-Coordinated Knowledge

We defined the naive Bayes network model of the post-coordinated knowledge in

Section 5.3. After processing documents for post-coordinated knowledge (PCK), we

have n documents and m PCKs.

Each PCK has a specific form shown in Figure 5.6. The object of inference in the

knowledge-based information retrieval model is to find a disease from the given findings

(combinations of T, M, E, and F). Each document does not contain complete PCKs

normally. Because of the lack of expert knowledge, it is impossible to write a complete

form of the PCKs in a patient's report. So we must estimate what kind of disease is most

likely from the given findings in the document. This is the key to improving the

knowledge enhancement of the retrieval process.

We modeled the PCKs using naive Bayes in Section 5.3. We can define the

posterior probability that we are attempting to calculate as:

P(D I t,m,e, f),

where D is the set of diseases that has a relationship with the given findings (t, m, e, and

f) found by searching PCKs. The posterior probability can be solved by Bayes theorem.

P(D)P(t, m, e, f D)
P(D | t, m, e, f) =
p(t,m,e,f)

In practice, we are only interested in the numerator of above fraction, since the

denominator does not depend on D and the values of the t, m, e, andfthat are given, so

the denominator is constant.

By the independence assumption, we can rewrite the fraction as:

1n
P(D I t,m,e,f) P(D) P( D)
z 1-1









where F, is set of findings.

The post-coordinated knowledge has specific relations with the individual

documents. Actually, the individual knowledge is defined from the specific contents of

each document, so we cannot use knowledge reduction in this case. Knowledge reduction

handles general knowledge conversion cases. So we have to apply the post-coordinated

knowledge to each document: more specifically to each individual document vector.

We can classify several cases for conversion of post-coordinated knowledge. Refer

to Figure 5-5 for the classification of post-coordinated knowledge. We use PCKB-a for a

one axis relation, PCKB-b for a two axes relations, PCKB-c for a three axes relation, and

PCKB-d for a four axes relationship.

Case 1: The document contains all four axes, for example (t, m, e, andf).
We must find the probability of based upon the existence of (t,m,e,J). This
is performed by searching PCKB-d. Searching PCKB-a, PCKB-b, or PCKB-
c is not necessary because those have less information. We can obtain only
one component of knowledge from PCKB because with the five axes of
information, the knowledge is complete and unique.

Case 2: The document contains three axes all except d.
Figure 6-5 shows an example of this case. Here, we must compute the
probability of each possible diseases, then another axis's, i.e.,
P(dl t, m, e) (1)
P(d2 t, m, e) (2)
after finding the possible post-coordinate knowledge from PCKB-c and
PCKB-d. Searching PCKB-d is required because PCKB-c can be inclusive
knowledge of PCKB-d.


Figure 6-5. Examples of case2









We know already P(dl), P(d2), P(tldl), P(mldl), P(eldl), P(flldl), P(tld2), P(mld2),

P(eld2), and P(f2ld2).

By the naive Bayes theorem, the posterior probability (1) and (2) can be calculated

and compared by:

P(d t, m, e)= P(dl)jP(F I dl) = aP(dl)P(t dl)P(m I dl)P(e dl)
z -

P(d2 t, m,e)= P(d2)jP(F Id2)=aP(d2)P(t d2)P(m d2)P(e d2).
Z 1=1

Then, we can augment the document vector according to the relative normalized

value ofP(dl]t, m, e) and P(d2 t, m, e) with some coefficient.

The complexity of this algorithm, is O(mn) where n is the number of documents

and m is a count of the post-coordinate knowledge.

Case 3 is the case when two axes relations found in document and case 4 is the case

when one axis relation found in document. The calculation is as straightforward as in

case 2.

6.6 SVM IR Engine: Document Retrieval

After the process of converting the document vector to the conceptual document

vector, the system can start accepting queries. A query is expressed identical to a

document vector where the query terms are the vector elements. The query vector q is

compared with the conceptual document vector cd, using the cosine similarity measure.

Definition 6.14: The similarity between q and cd, is
--- q7.c- Zin
q q:i, 4 cd
cos(q, cd) =-
qd d j 2j









The similarity measure produces a ranked list of relevant documents related to the

query.

6.7 Summary

We described the details of Knowledge-base Conversion Engine (KCE). The KCE

reduces knowledge expressed by a semantic network or a Bayesian network into

quantitative values to provide efficiency in the retrieval process. Conversion of the

knowledge is called knowledge reduction because the reduction process reduces the

graphical knowledge into a two-dimensional value representing the number of relations

between the two terms.

The conversion of Bayesian network knowledge is done by directly applying the

inferred certainty value into a document vector. This process applies the knowledge into

the individual documents, which is called personalized knowledge application, while the

conversion of pre-coordinated knowledge is general application of knowledge.

In the next chapter, we describe the result of performance evaluation of the

developed the knowledge base information retrieval model.















CHAPTER 7
PERFORMANCE EVALUATION

7.1 Simulation Parameters

In our experiment, we used recall and precision metrics for evaluation of the

performance as explained in Section 2.4. We can consider the gain of performance if the

recall-precision graph shape goes to upper right direction shown in Figure 7-1, because in

ideal case, the precision should be 100% when the recall is 100%.



100 Performance
Increasing

Precision
50

Performance
Decreasing

0
10 Recall 60
Figure 7-1. Performance evaluation metrics

To calculate precision and recall, we must know the exact relationship between

each document and the query. An expert should determine this, so it is impossible to

evaluate the relevancy between documents and a query if the set is big. In our case, the

total number of documents is nearly one half million. We selected 2000 case documents

signed by a top expert, because those documents should have a low error rate in

describing post-coordinate knowledge. Then we selected 261 cases randomly among

2000 cases because we need to reduce the size of set to be able to examine relevancy by

human expert. The selected 261 cases were examined for their relevancy with queries









"membranous nephropathy lupus" and nephroticc syndrome." Our expert rated the

relevancy between each document and the query as "Positive," "Neutral," and

"Negative."

In this chapter, we call query "membranous nephropathy lupus" as Q1 and

nephroticc syndrome" as Q2. Table 7-1 shows the result of evaluation for the 261

documents.

Table 7-1. Relevancy check result of 261 simulation documents
Query # of positive # of neutral # of negative Total relevant
(positive+neutral)
Q1 24 95 142 119
Q2 23 90 148 113


7.2 Simulation Result

7.2.1 Performance Evaluation with Pre-Coordinated Knowledge

Figure 7-2 shows the result of the query "membranous nephropathy lupus" on the

positive cases. This graph shows some degradation of performance for the knowledge

based information retrieval (KBIR) model compared with the support vector machine

(SVM). We can think of the KBIR having the same effect as query expansion. The KBIR

expands the document vector instead of the query vector. If the knowledge has synonyms,

the KBIR expands the document vector to include synonyms of the query "membranous

nephropathy lupus." This causes an expansion to a somewhat broader range of

knowledge. For example, "membranous" can be expanded to a more general term, so the

degradation on the positive case may be caused by a general expansion of the knowledge

of KBIR. This can be explained more by looking at the results of query 1 if we included

the neutral cases in the performance evaluation as shown in Figure 7-3.












100.0%

90.0%

80.0%

70.0%

60.0%

50.0%

40.0%

30.0%

20.0%

10.0%

0.0%
0


20.0% 40.0% 60.0% 80.0% 100.0%

Recall


Figure 7-2. Comparison of performance for query 1 on positive cases.


20.0% 40.0% 60.0%

Recall


KBIR
_VSM
Synset
CrossRef
- -- SomeRel
-- Syn+CrossRef


80.0% 100.0%


Figure 7-3. Evaluation results of query 1 including the neutral cases.


---


.0%


100.0%

90.0%

80.0%

70.0%

60.0%

50.0%

40.0%

30.0%

20.0%

10.0%


0.0% -
0.0%


--KBIR
_-VSM
- -Synset
CrossRef
-- SomeRel











100.0%

95.0%


90.0% KBIR
=\ -VSM
S85.0% Synonym
SCrossRef
80.0% --- SomeRel

75.0%

70.0% -
0.0% 20.0% 40.0% 60.0% 80.0% 100.0%

Recall

Figure 7-4. Evaluation results for query 2 for the positive cases


100.0%
90.0%
80.0%
70.0%
60.0%
50.0%
40.0%
30.0%
20.0%
10.0%
0.0%


KBIR
-VSM
Synonym
CrossRef
------ SomeRel


0.0% 20.0% 40.0% 60.0% 80.0% 100.0%
Recall

Figure 7-5. Evaluation results for query 2 including the neutral cases


-- ~









These results show big gain in performance when compared with the degradation

that occurs with only the positive cases. If we look at the result more generally, meaning

there is an importance to the neutral cases, the performance evaluation result shows

promising result. The gain can be explained by the expansion of knowledge in the

document vector.

If we look at the result of VSM, the resulting documents only have to contain one

of the query terms: membranous, nephropathy, or lupus. But KBIR retrieves some

documents that do not contain any query words because the document vector was

extended to contain terms related to the existing terms in these documents. This increases

the recall rate. If we look at precision, this starts to make sense when we consider the

results more generally.

Figure 7-4 is the result of query 2, nephroticc syndrome" on just the positive cases.

When this is contrasted with the evaluation of query 1 on positive cases, the results show

a performance gain. This can be explained by the characteristics of KBIR's knowledge

management. Because the number of terms in query 2 is smaller than in query 1, the

amount of expanded knowledge for query 2 is less than for query 1. This means that

knowledge expansion for queries having fewer query terms tends to have smaller error

rates compared to queries having many terms.

If we look at the performance evaluation results of query 2 including the neutral

cases shown in Figure 7-5, they show a lower performance gain when compared to the

results of query 1. This can be explained also by the small expansion of knowledge

caused by lower number of terms in the query.









If we look at the effects of each relationship on KBIR performance, we can say the

result of KBIR performance is the sum of each relation: synonym, cross reference, and

some relation. Normally, synonym relations do not show a significant contribution but

cross reference relations (i.e., relations between SNOMED axes) show a significant

contribution in performance. This can be explained as each document's concept can be

expressed by a SNOMED equation, so the relationship between concepts is more

important than just the synonym relations between terms. Table 7-2 shows quantitative

values of performance gain for the pre-coordinated knowledge addition compared to the

VSM method.

Table 7-2. Value of performance gain of pre-coordinated knowledge compared to VSM
Query Performance gain (%)
Query 1 39.6
Query 2 20.6
Average 30.1


7.2.2 Performance Evaluation with Naive Bayes Post-Coordinated Knowledge

Figure 7-6 shows the performance gain when we use the naive Bayes post-

coordinated knowledge for queryl and Figure 7-7 for query 2. Table 7-3 shows the

quantitative value of performance gain compared to VSM and pre-coordinated

knowledge.

Table 7-3. Value of performance gain of post-coordinated knowledge


Query Performance gain (%) Performance gain (%)
Compared to pre- Compared to VSM
coordinated knowledge
Query 1 7.0 47.0
Query 2 8.2 28.8
Average 7.6 37.9


I ne results snow nearly te same percentage ot improvement compared to tne pre-

coordinated knowledge case and different gain compared to the VSM case.


rl












100.0%

90.0%

80.0%

70.0%

60.0%

50.0%
4.%- -
40.0%

30.0%

20.0%

10.0%

0.0%
0.0%


With PK
- Without PK
VSM


80.0% 100.0%


Figure 7-6. Evaluation results of query 1 including post-coordinated knowledge


100.0%

90.0%

80.0%

70.0%

60.0%

50.0% -

40.0% -

30.0%

20.0%

10.0%

0.0% -
0.0% 20.0% 40.0% 60.0% 80.0% 100.0%

Recall


With PK
Without PK
........ V SM


Figure 7-7. Evaluation results of query 2 including post-coordinated knowledge


20.0% 40.0% 60.0%
Recall


-







The reason is straightforward for the effects of knowledge application of our model
explained in previous section.
7.2.3 Performance of Statistical Post-Coordinate Knowledge Model
There is no significant performance improvement on this model as seen on Figure
7-8. We thought the statistical model of post-coordinated knowledge is general
knowledge that can be applicable to all documents regardless of its own semantics of each
document. The result shows the assumption is incorrect.


100.0%
90.0%
80.0%
70.0%
60.0%
50.0%
40.0%
30.0%
20.0%
10.0%
0.0%


With PK
- Without PK
.... SVM


0.0% 20.0% 40.0% 60.0% 80.0% 100.0%
Recall
Figure 7-8. Evaluation results of query 1 including statistical post-coordinated
knowledge
7.3 Summary
We showed the results of performance evaluation for our knowledge-based
information retrieval model showing the effects of each pre-coordinated knowledge and
post-coordinated knowledge.


i









The results show a nearly 30% increase for pre-coordinated knowledge application

and 37% increase for post-coordinated knowledge application compared to VSM. These

increases occur even though the real-time speed of processing is comparable to VSM.

We applied the statistical model of post-coordinated knowledge to all documents

evenly by inserting computed relations into the term-context matrix. We assumed the

statistical post-coordinated knowledge is general knowledge that can be applied evenly.

But from the simulation results of the statistical model, we can conclude that the post-

coordinated knowledge is personalized knowledge that should be applied to each

document separately. We applied the naive Bayes model based knowledge to each

document's term vector separately.

The next chapter concludes our research summarizing contributions and identifying

future work.














CHAPTER 8
CONCLUSION

In this dissertation, we have shown significant progress towards developing an

information retrieval model augmented by a knowledge base. We created a knowledge

based information retrieval (KBIR) model showing meaningful performance gain while

providing same speed performance in the retrieval process. We summarize our

contributions in Section 8.1 and discuss directions for future work in Section 8.2.

8.1 Contributions

The objective of this dissertation was to design an intelligent information retrieval

model producing knowledge infused answers to users by incorporating a domain-specific

ontology in the knowledge-base using a computationally efficient knowledge conversion

method. The main contributions of the dissertation to information retrieval research are as

follows:

Knowledge reduction to statistical model: The developed information retrieval

model is a knowledge-based information retrieval model. Unlike the other models, which

perform an ontology level information retrieval task such as an ontology comparison and

an ontological query expansion, the proposed model reduces the knowledge level

represented by the knowledge base to a statistical model such as the vector space model's

document vector shown in Figure 8-1. We used semantic networks for pre-defined

knowledge and naive Bayes networks for post-coordinated knowledge. Those graphical

knowledge representations are human friendly and easily understandable to human but

computationally complex. The reduced statistical form of knowledge, such as a









conceptual document vector, is not human friendly but is computer friendly and

computationally efficient.


Figure 8-1. Knowledge reduction to statistical model


OFF-LINE
CALCULATION


Figure 8-2. Off-line application of knowledge

Off-line application of knowledge: Using knowledge reduction enables the off-

line processing of the application (calculation) of knowledge to the information retrieval

procedure shown in Figure 8-2. Only the conceptual document vector, which can be









obtained from the document vector and the knowledge base, is involved in the on-line

process of producing ranked results by comparing a user's query and the documents.

Inverse query expansion: The result of our knowledge-based information retrieval

model is very similar to that of query expansion or latent semantic. Unlike those models,

which calculate part of the knowledge during the retrieving process, our model does its

processing offline, giving the same effect with a lower computational burden.

Applicability to general open domain: Even if the proposed model uses domain-

specific knowledge, this model can be used in an open-domain application if some types

of knowledge bases are supported. One possible candidate for the open domain

knowledge base is WordNet, which has a thesaurus and relations from the natural

language domain.

Flexibility on the knowledge representation: We defined some examples of

knowledge reduction methods using a semantic network. The semantic network is an

example of a knowledge representation, which is one of artificial intelligence's field

handling ontologies. Our model has flexibility on the type of knowledge representation if

we can define the knowledge reduction scheme of the selected knowledge representation

model. In our model, we used a naive Bayes network for representing post-coordinated

knowledge. It has classification ability with less computational complexity and a

reasonable approximation of conditional independence.

8.2 Future Work

One task that needs completing is the modeling of the hierarchical knowledge. To

adequately model the hierarchy in the Pathology domain requires that we refine the

hierarchical relationship by looking at the SNOMED book. The reason is that the

database storing the SNOMED notations is incomplete in exactly defining the









hierarchical relationships. We need to make complete sets of the hierarchy of over 50,000

semantic relations existing in the SNOMED book to apply the hierarchy in our

knowledge-base IR model. There is a possibility to use the current version of SNOMED,

SNOMED-CT, that provides a more profound and accurate set of relationships in the

pathology domain. This should be handled as a separate project because of the size and

depth of the work. We can induce the result when we add the hierarchical knowledge in

our model by looking at the results of other relation additions. The trends of relation

additions show a higher degradation of performance if the relations are more general. We

think that the hierarchical relationships will add a larger number of relations to the term

matrix than the other relations, resulting in some degradation on the precision, but with a

gain in recall.

A second extension of this work is to apply our model to the open domain

information retrieval process. Using WordNet as a knowledge source, we can see if there

is a performance gain in general domain information retrieval. Extracting knowledge

automatically from given documents to use as a knowledge source for the information

retrieval process is a possible approach towards applying our model to the general open

domain.

Finally, we used the naive Bayes network for modeling post-coordinated

knowledge. The naive Bayes model assumes independence among findings. Several

Bayesian network based models exist providing dependence model among findings. Even

though several papers identify that the naive Bayes model shows acceptable performance

in its simple form, it would be worthwhile to compare the performance between the naive

Bayes and other models providing the dependency relations between findings.















APPENDIX A
PRIMARY TERMS WHICH ARE THE BASIS FOR THE DB ATTRIBUTE

Table A-1. Primary terms for APDB
Terms Roles Etc
Format:NNNN-YY-T This number also
SURG PATH NO NNNN: Serial number distinct in one year, shown at the end of
SURGICAL digit width may vary the line having
PATHOLOGY NO# YY: year expressed in two digit format:
ACC. # T: Type = { C, S, O, G, M } YYTNNNNN###Y
ACC# Type YMMDD
CYTOLOGY NO "C" Consultation Rpt
"S" in-house surgical Rpt
Patient name
NAME
Format: Last, First, Middle, Suffix
TEST NO Test number
SPECIMEN NO
SPECIMEN Specimen number
SPECIMEN
MED REC NO 6 digit unique number of each hospital
Medical Record # format: NN-NN-NN may vary
ROOM WARD
Room number
WARD Patient location
Age of patient
Format: NN [Y|M|D]
AGE NN number
Y represent year
M month
D day
SEX* Sex of patient
SEX
Format: {M|F}
Service date
DATE
Service Date Format: Month Day, Year
Example: JANUARY 07, 1981
PHYS
PHYSICIAN
rrin P ii Referring Physician or Surgeon
Referring Physician
Surgeon
REPORT TYPE Example: S1 Surgical
Date obtained
SERVICE
Date received
Date Obtained Date obtained
Date Received Date received










Table A-i Continued
Terms Roles Etc
HISTORY
CLTRICAL Clinical history
CLINICAL Long text
HISTORY Specimen(s) submitted/ Procedures ordered
Specimen submitted *
GROSS
DESCRIPTION
MICROSCOPIC Light Microscopy
MICROSCOPIC
DESCRPTIO* Immunofluorescence microscopy
DESCRIPTION
MICROSCOPTIC Electron microscopy
DESCRIPTION Other tests: e.g. included cytogenetics,
molecular biology, or flow cytometry data
DIAGNOSIS Bone marrow, aspiration: No lymphoma
DIAGNOSIS detected
detected
COMMENT *
PATHOLOGIST *
Diagnostic/Retrieval codes
Modifier codes
Transaction codes: JP/whd
RETRIEVAL CODES
Date of transcription: 03/23/99
Electronic signatures
Date Electronically signed out




















APPENDIX B
SNOMED STATISTICS


Table B-1. Partial list of T code
P(code/total
Name Number disease) P(code/documents)


T8X330
T8X310
TOOXXO
T83000
T8X210
TOX000
T83300
T82
T83320
T74000
T84000
T8X
T86800
T2Y030
T88100
T06000
T01000
T8X320
T7X100
T56000
T2Y414
T67000
T71000
T77100
T63000
T32010
T80100
T6X940
T86120
T04030
T86110
T04020
T57000
TOXOO
T08000
T82900
T81000
T88960
T66000


142850
64010
53701
33989
22408
16728
14706
14232
13621
13125
12585
11307
8341
7327
7299
6825
6449
5778
5648
5495
5407
4980
4853
4597
4457
4185
4137
3679
3609
3543
3541
3523
3514
3301
3207
2875
2863
2746
2726


0.203217335
0.091060144
0.076394639
0.048352496
0.031877452
0.023797127
0.020920645
0.020246336
0.019377132
0.018671526
0.017903326
0.016085253
0.011865844
0.010423335
0.010383502
0.009709194
0.009174299
0.008219739
0.008034802
0.007817146
0.007691958
0.007084511
0.006903841
0.006539658
0.006340495
0.005953550
0.005885265
0.005233718
0.005134136
0.005040245
0.005037400
0.005011793
0.004998990
0.004695978
0.004562254
0.004089953
0.004072882
0.003906439
0.003877987


0.491999201
0.220461105
0.184955191
0.117063779
0.077176886
0.057614019
0.050649914
0.049017379
0.046912993
0.045204687
0.043344837
0.038943192
0.028727794
0.025235409
0.025138972
0.023506437
0.022211431
0.019900395
0.019452653
0.018925696
0.018622609
0.017151950
0.016714541
0.015832834
0.015350651
0.014413837
0.014248517
0.012671089
0.012429997
0.012202682
0.012195794
0.012133799
0.012102801
0.011369194
0.011045442
0.009901979
0.009860649
0.009457682
0.009388798