|
122 | 122 | * thousands of trigrams would be slow, and would likely produce so many
|
123 | 123 | * false positives that we would have to traverse a large fraction of the
|
124 | 124 | * index, the graph is simplified further in a lossy fashion by removing
|
125 |
| - * color trigrams until the number of trigrams after expansion is below |
126 |
| - * the MAX_TRGM_COUNT threshold. When a color trigram is removed, the states |
127 |
| - * connected by any arcs labelled with that trigram are merged. |
| 125 | + * color trigrams. When a color trigram is removed, the states connected by |
| 126 | + * any arcs labelled with that trigram are merged. |
| 127 | + * |
| 128 | + * Trigrams do not all have equivalent value for searching: some of them are |
| 129 | + * more frequent and some of them are less frequent. Ideally, we would like |
| 130 | + * to know the distribution of trigrams, but we don't. But because of padding |
| 131 | + * we know for sure that the empty character is more frequent than others, |
| 132 | + * so we can penalize trigrams according to presence of whitespace. The |
| 133 | + * penalty assigned to each color trigram is the number of simple trigrams |
| 134 | + * it would produce, times the penalties[] multiplier associated with its |
| 135 | + * whitespace content. (The penalties[] constants were calculated by analysis |
| 136 | + * of some real-life text.) We eliminate color trigrams starting with the |
| 137 | + * highest-penalty one, until we get to a total penalty of no more than |
| 138 | + * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would |
| 139 | + * lead to merging the initial and final states, so we may not be able to |
| 140 | + * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than |
| 141 | + * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail. |
128 | 142 | *
|
129 | 143 | * 4) Pack the graph into a compact representation
|
130 | 144 | * -----------------------------------------------
|
|
199 | 213 | * MAX_EXPANDED_STATES - How many states we allow in expanded graph
|
200 | 214 | * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
|
201 | 215 | * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
|
| 216 | + * WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties |
202 | 217 | * COLOR_COUNT_LIMIT - Maximum number of characters per color
|
203 | 218 | */
|
204 | 219 | #define MAX_EXPANDED_STATES 128
|
205 | 220 | #define MAX_EXPANDED_ARCS 1024
|
206 | 221 | #define MAX_TRGM_COUNT 256
|
| 222 | +#define WISH_TRGM_PENALTY 16 |
207 | 223 | #define COLOR_COUNT_LIMIT 256
|
208 | 224 |
|
| 225 | +/* |
| 226 | + * Penalty multipliers for trigram counts depending on whitespace contents. |
| 227 | + * Numbers based on analysis of real-life texts. |
| 228 | + */ |
| 229 | +const float4 penalties[8] = { |
| 230 | + 1.0, /* "aaa" */ |
| 231 | + 3.5, /* "aa " */ |
| 232 | + 0.0, /* "a a" (impossible) */ |
| 233 | + 0.0, /* "a " (impossible) */ |
| 234 | + 4.2, /* " aa" */ |
| 235 | + 2.1, /* " a " */ |
| 236 | + 25.0, /* " a" */ |
| 237 | + 0.0 /* " " (impossible) */ |
| 238 | +}; |
| 239 | + |
209 | 240 | /* Struct representing a single pg_wchar, converted back to multibyte form */
|
210 | 241 | typedef struct
|
211 | 242 | {
|
@@ -339,6 +370,7 @@ typedef struct
|
339 | 370 | ColorTrgm ctrgm;
|
340 | 371 | int number;
|
341 | 372 | int count;
|
| 373 | + float4 penalty; |
342 | 374 | bool expanded;
|
343 | 375 | List *arcs;
|
344 | 376 | } ColorTrgmInfo;
|
@@ -459,7 +491,7 @@ static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext);
|
459 | 491 | static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]);
|
460 | 492 | static void mergeStates(TrgmState *state1, TrgmState *state2);
|
461 | 493 | static int colorTrgmInfoCmp(const void *p1, const void *p2);
|
462 |
| -static int colorTrgmInfoCountCmp(const void *p1, const void *p2); |
| 494 | +static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2); |
463 | 495 | static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext);
|
464 | 496 | static int packArcInfoCmp(const void *a1, const void *a2);
|
465 | 497 |
|
@@ -1424,6 +1456,7 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
|
1424 | 1456 | TrgmState *state;
|
1425 | 1457 | ColorTrgmInfo *colorTrgms;
|
1426 | 1458 | int64 totalTrgmCount;
|
| 1459 | + float4 totalTrgmPenalty; |
1427 | 1460 | int number;
|
1428 | 1461 |
|
1429 | 1462 | /* Collect color trigrams from all arcs */
|
@@ -1482,53 +1515,67 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
|
1482 | 1515 | }
|
1483 | 1516 |
|
1484 | 1517 | /*
|
1485 |
| - * Count number of simple trigrams generated by each color trigram. |
| 1518 | + * Count number of simple trigrams generated by each color trigram, and |
| 1519 | + * also compute a penalty value, which is the number of simple trigrams |
| 1520 | + * times a multiplier that depends on its whitespace content. |
1486 | 1521 | *
|
1487 | 1522 | * Note: per-color-trigram counts cannot overflow an int so long as
|
1488 | 1523 | * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
|
1489 | 1524 | * 1290. However, the grand total totalTrgmCount might conceivably
|
1490 |
| - * overflow an int, so we use int64 for that within this routine. |
| 1525 | + * overflow an int, so we use int64 for that within this routine. Also, |
| 1526 | + * penalties are calculated in float4 arithmetic to avoid any overflow |
| 1527 | + * worries. |
1491 | 1528 | */
|
1492 | 1529 | totalTrgmCount = 0;
|
| 1530 | + totalTrgmPenalty = 0.0f; |
1493 | 1531 | for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
|
1494 | 1532 | {
|
1495 | 1533 | ColorTrgmInfo *trgmInfo = &colorTrgms[i];
|
1496 | 1534 | int j,
|
1497 |
| - count = 1; |
| 1535 | + count = 1, |
| 1536 | + typeIndex = 0; |
1498 | 1537 |
|
1499 | 1538 | for (j = 0; j < 3; j++)
|
1500 | 1539 | {
|
1501 | 1540 | TrgmColor c = trgmInfo->ctrgm.colors[j];
|
1502 | 1541 |
|
1503 |
| - if (c != COLOR_BLANK) |
| 1542 | + typeIndex *= 2; |
| 1543 | + if (c == COLOR_BLANK) |
| 1544 | + typeIndex++; |
| 1545 | + else |
1504 | 1546 | count *= trgmNFA->colorInfo[c].wordCharsCount;
|
1505 | 1547 | }
|
1506 | 1548 | trgmInfo->count = count;
|
1507 | 1549 | totalTrgmCount += count;
|
| 1550 | + trgmInfo->penalty = penalties[typeIndex] * (float4) count; |
| 1551 | + totalTrgmPenalty += trgmInfo->penalty; |
1508 | 1552 | }
|
1509 | 1553 |
|
1510 |
| - /* Sort color trigrams in descending order of simple trigram counts */ |
| 1554 | + /* Sort color trigrams in descending order of their penalties */ |
1511 | 1555 | qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
|
1512 |
| - colorTrgmInfoCountCmp); |
| 1556 | + colorTrgmInfoPenaltyCmp); |
1513 | 1557 |
|
1514 | 1558 | /*
|
1515 |
| - * Remove color trigrams from the graph so long as total number of simple |
1516 |
| - * trigrams exceeds MAX_TRGM_COUNT. We prefer to remove color trigrams |
1517 |
| - * with the most associated simple trigrams, since those are the most |
1518 |
| - * promising for reducing the total number of simple trigrams. When |
1519 |
| - * removing a color trigram we have to merge states connected by arcs |
1520 |
| - * labeled with that trigram. It's necessary to not merge initial and |
1521 |
| - * final states, because our graph becomes useless if that happens; so we |
1522 |
| - * cannot always remove the trigram we'd prefer to. |
| 1559 | + * Remove color trigrams from the graph so long as total penalty of color |
| 1560 | + * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to |
| 1561 | + * WISH_TRGM_PENALTY, it's OK so long as total count is no more than |
| 1562 | + * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher |
| 1563 | + * penalty, since those are the most promising for reducing the total |
| 1564 | + * penalty. When removing a color trigram we have to merge states |
| 1565 | + * connected by arcs labeled with that trigram. It's necessary to not |
| 1566 | + * merge initial and final states, because our graph becomes useless if |
| 1567 | + * that happens; so we cannot always remove the trigram we'd prefer to. |
1523 | 1568 | */
|
1524 |
| - for (i = 0; |
1525 |
| - (i < trgmNFA->colorTrgmsCount) && (totalTrgmCount > MAX_TRGM_COUNT); |
1526 |
| - i++) |
| 1569 | + for (i = 0; i < trgmNFA->colorTrgmsCount; i++) |
1527 | 1570 | {
|
1528 | 1571 | ColorTrgmInfo *trgmInfo = &colorTrgms[i];
|
1529 | 1572 | bool canRemove = true;
|
1530 | 1573 | ListCell *cell;
|
1531 | 1574 |
|
| 1575 | + /* Done if we've reached the target */ |
| 1576 | + if (totalTrgmPenalty <= WISH_TRGM_PENALTY) |
| 1577 | + break; |
| 1578 | + |
1532 | 1579 | /*
|
1533 | 1580 | * Does any arc of this color trigram connect initial and final
|
1534 | 1581 | * states? If so we can't remove it.
|
@@ -1570,9 +1617,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
|
1570 | 1617 | mergeStates(source, target);
|
1571 | 1618 | }
|
1572 | 1619 |
|
1573 |
| - /* Mark trigram unexpanded, and update totalTrgmCount */ |
| 1620 | + /* Mark trigram unexpanded, and update totals */ |
1574 | 1621 | trgmInfo->expanded = false;
|
1575 | 1622 | totalTrgmCount -= trgmInfo->count;
|
| 1623 | + totalTrgmPenalty -= trgmInfo->penalty; |
1576 | 1624 | }
|
1577 | 1625 |
|
1578 | 1626 | /* Did we succeed in fitting into MAX_TRGM_COUNT? */
|
@@ -1746,17 +1794,17 @@ colorTrgmInfoCmp(const void *p1, const void *p2)
|
1746 | 1794 |
|
1747 | 1795 | /*
|
1748 | 1796 | * Compare function for sorting color trigrams in descending order of
|
1749 |
| - * their simple trigrams counts. |
| 1797 | + * their penalty fields. |
1750 | 1798 | */
|
1751 | 1799 | static int
|
1752 |
| -colorTrgmInfoCountCmp(const void *p1, const void *p2) |
| 1800 | +colorTrgmInfoPenaltyCmp(const void *p1, const void *p2) |
1753 | 1801 | {
|
1754 |
| - const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1; |
1755 |
| - const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2; |
| 1802 | + float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty; |
| 1803 | + float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty; |
1756 | 1804 |
|
1757 |
| - if (c1->count < c2->count) |
| 1805 | + if (penalty1 < penalty2) |
1758 | 1806 | return 1;
|
1759 |
| - else if (c1->count == c2->count) |
| 1807 | + else if (penalty1 == penalty2) |
1760 | 1808 | return 0;
|
1761 | 1809 | else
|
1762 | 1810 | return -1;
|
|
0 commit comments