정규화
업데이트: 2007년 11월
일부 유니코드 문자에는 조합 또는 합성 유니코드 문자의 집합으로 구성된 여러 개의 동일한 이진 표현이 있습니다. 단일 문자가 여러 가지로 표현될 수 있으므로 검색, 정렬, 비교 및 기타 작업이 복잡해집니다.
유니코드 표준에서는 문자의 동일한 이진 표현 중 어느 것을 전달해도 단일 이진 표현을 반환하는 정규화라는 프로세스를 정의합니다. 정규화에는 각각 다른 규칙을 따르는 정규화 형식이라는 몇 가지 알고리즘이 사용됩니다. 현재 .NET Framework에서는 유니코드 정규화 형식 C, D, KC 및 KD를 지원합니다.
참고: |
---|
동일한 정규화 형식으로 정규화된 두 문자열은 서수 비교(문자 단위 이진 비교)를 사용하여 비교할 수 있습니다. |
.NET Framework에서 지원하는 정규화 형식에 대한 자세한 정보는 NormalizationForm을 참조하십시오. 정규화, 문자 분해 및 동일성에 대한 자세한 정보는 Unicode 홈 사이트에서 Unicode Standard Annex #15, "Unicode Normalization Forms"를 참조하십시오.
문자열 정규화
응용 프로그램에서는 기본적으로 정규화 형식 C로 정규화된 새 문자열을 반환하는 String 개체의 String.Normalize 메서드를 사용해야 합니다. 또는 String 개체의 String.Normalize 메서드를 사용하고 NormalizationForm 값을 지정하여 정규화 형식 C, D, KC 또는 KD로 정규화된 새 문자열을 반환할 수도 있습니다.
문자열의 정규화 여부 확인
응용 프로그램에서 String 개체의 String.IsNormalized 메서드를 사용하여 개체의 문자열 값이 정규화 형식 C로 정규화되었는지 확인할 수 있습니다. 또는 String 개체의 String.IsNormalized 메서드를 사용하고 특정 NormalizationForm 값을 지정하여 개체의 문자열 값이 정규화 형식 C, D, KC 또는 KD로 정규화되었는지 확인할 수도 있습니다.
예제
다음 코드 예제에서는 IsNormalized 및 Normalize 메서드를 보여 줍니다. 이 코드 예제에서는 원래 문자열이 네 가지 정규화 형식 중 하나인지 테스트하고, 원래 문자열에 각 정규화 형식을 적용한 버전을 만들고, 정규화된 각 문자열이 의도된 정규화 형식인지 테스트한 다음 정규화된 각 문자열에 있는 모든 문자의 16진수 코드 포인트를 표시합니다.
' This example demonstrates the String.Normalize method
' and the String.IsNormalized method
Imports System
Imports System.Text
Imports Microsoft.VisualBasic
Class Sample
Public Shared Sub Main()
' Character c; combining characters acute and cedilla; character 3/4
Dim s1 = New [String](New Char() {ChrW(&H0063), ChrW(&H0301), ChrW(&H0327), ChrW(&H00BE)})
Dim s2 As String = Nothing
Dim divider = New [String]("-"c, 80)
divider = [String].Concat(Environment.NewLine, divider, Environment.NewLine)
Try
Show("s1", s1)
Console.WriteLine()
Console.WriteLine("U+0063 = LATIN SMALL LETTER C")
Console.WriteLine("U+0301 = COMBINING ACUTE ACCENT")
Console.WriteLine("U+0327 = COMBINING CEDILLA")
Console.WriteLine("U+00BE = VULGAR FRACTION THREE QUARTERS")
Console.WriteLine(divider)
Console.WriteLine("A1) Is s1 normalized to the default form (Form C)?: {0}", s1.IsNormalized())
Console.WriteLine("A2) Is s1 normalized to Form C?: {0}", s1.IsNormalized(NormalizationForm.FormC))
Console.WriteLine("A3) Is s1 normalized to Form D?: {0}", s1.IsNormalized(NormalizationForm.FormD))
Console.WriteLine("A4) Is s1 normalized to Form KC?: {0}", s1.IsNormalized(NormalizationForm.FormKC))
Console.WriteLine("A5) Is s1 normalized to Form KD?: {0}", s1.IsNormalized(NormalizationForm.FormKD))
Console.WriteLine(divider)
Console.WriteLine("Set string s2 to each normalized form of string s1.")
Console.WriteLine()
Console.WriteLine("U+1E09 = LATIN SMALL LETTER C WITH CEDILLA AND ACUTE")
Console.WriteLine("U+0033 = DIGIT THREE")
Console.WriteLine("U+2044 = FRACTION SLASH")
Console.WriteLine("U+0034 = DIGIT FOUR")
Console.WriteLine(divider)
s2 = s1.Normalize()
Console.Write("B1) Is s2 normalized to the default form (Form C)?: ")
Console.WriteLine(s2.IsNormalized())
Show("s2", s2)
Console.WriteLine()
s2 = s1.Normalize(NormalizationForm.FormC)
Console.Write("B2) Is s2 normalized to Form C?: ")
Console.WriteLine(s2.IsNormalized(NormalizationForm.FormC))
Show("s2", s2)
Console.WriteLine()
s2 = s1.Normalize(NormalizationForm.FormD)
Console.Write("B3) Is s2 normalized to Form D?: ")
Console.WriteLine(s2.IsNormalized(NormalizationForm.FormD))
Show("s2", s2)
Console.WriteLine()
s2 = s1.Normalize(NormalizationForm.FormKC)
Console.Write("B4) Is s2 normalized to Form KC?: ")
Console.WriteLine(s2.IsNormalized(NormalizationForm.FormKC))
Show("s2", s2)
Console.WriteLine()
s2 = s1.Normalize(NormalizationForm.FormKD)
Console.Write("B5) Is s2 normalized to Form KD?: ")
Console.WriteLine(s2.IsNormalized(NormalizationForm.FormKD))
Show("s2", s2)
Console.WriteLine()
Catch e As Exception
Console.WriteLine(e.Message)
End Try
End Sub 'Main
Private Shared Sub Show(title As String, s As String)
Console.Write("Characters in string {0} = ", title)
Dim x As Char
For Each x In s.ToCharArray()
Console.Write("{0:X4} ", AscW(x))
Next x
Console.WriteLine()
End Sub 'Show
End Class 'Sample
'
'This example produces the following results:
'
'Characters in string s1 = 0063 0301 0327 00BE
'
'U+0063 = LATIN SMALL LETTER C
'U+0301 = COMBINING ACUTE ACCENT
'U+0327 = COMBINING CEDILLA
'U+00BE = VULGAR FRACTION THREE QUARTERS
'
'--------------------------------------------------------------------------------
'
'A1) Is s1 normalized to the default form (Form C)?: False
'A2) Is s1 normalized to Form C?: False
'A3) Is s1 normalized to Form D?: False
'A4) Is s1 normalized to Form KC?: False
'A5) Is s1 normalized to Form KD?: False
'
'--------------------------------------------------------------------------------
'
'Set string s2 to each normalized form of string s1.
'
'U+1E09 = LATIN SMALL LETTER C WITH CEDILLA AND ACUTE
'U+0033 = DIGIT THREE
'U+2044 = FRACTION SLASH
'U+0034 = DIGIT FOUR
'
'--------------------------------------------------------------------------------
'
'B1) Is s2 normalized to the default form (Form C)?: True
'Characters in string s2 = 1E09 00BE
'
'B2) Is s2 normalized to Form C?: True
'Characters in string s2 = 1E09 00BE
'
'B3) Is s2 normalized to Form D?: True
'Characters in string s2 = 0063 0327 0301 00BE
'
'B4) Is s2 normalized to Form KC?: True
'Characters in string s2 = 1E09 0033 2044 0034
'
'B5) Is s2 normalized to Form KD?: True
'Characters in string s2 = 0063 0327 0301 0033 2044 0034
'
// This example demonstrates the String.Normalize method
// and the String.IsNormalized method
using System;
using System.Text;
class Sample
{
public static void Main()
{
// Character c; combining characters acute and cedilla; character 3/4
string s1 = new String( new char[] {'\u0063', '\u0301', '\u0327', '\u00BE'});
string s2 = null;
string divider = new String('-', 80);
divider = String.Concat(Environment.NewLine, divider, Environment.NewLine);
try
{
Show("s1", s1);
Console.WriteLine();
Console.WriteLine("U+0063 = LATIN SMALL LETTER C");
Console.WriteLine("U+0301 = COMBINING ACUTE ACCENT");
Console.WriteLine("U+0327 = COMBINING CEDILLA");
Console.WriteLine("U+00BE = VULGAR FRACTION THREE QUARTERS");
Console.WriteLine(divider);
Console.WriteLine("A1) Is s1 normalized to the default form (Form C)?: {0}",
s1.IsNormalized());
Console.WriteLine("A2) Is s1 normalized to Form C?: {0}",
s1.IsNormalized(NormalizationForm.FormC));
Console.WriteLine("A3) Is s1 normalized to Form D?: {0}",
s1.IsNormalized(NormalizationForm.FormD));
Console.WriteLine("A4) Is s1 normalized to Form KC?: {0}",
s1.IsNormalized(NormalizationForm.FormKC));
Console.WriteLine("A5) Is s1 normalized to Form KD?: {0}",
s1.IsNormalized(NormalizationForm.FormKD));
Console.WriteLine(divider);
Console.WriteLine("Set string s2 to each normalized form of string s1.");
Console.WriteLine();
Console.WriteLine("U+1E09 = LATIN SMALL LETTER C WITH CEDILLA AND ACUTE");
Console.WriteLine("U+0033 = DIGIT THREE");
Console.WriteLine("U+2044 = FRACTION SLASH");
Console.WriteLine("U+0034 = DIGIT FOUR");
Console.WriteLine(divider);
s2 = s1.Normalize();
Console.Write("B1) Is s2 normalized to the default form (Form C)?: ");
Console.WriteLine(s2.IsNormalized());
Show("s2", s2);
Console.WriteLine();
s2 = s1.Normalize(NormalizationForm.FormC);
Console.Write("B2) Is s2 normalized to Form C?: ");
Console.WriteLine(s2.IsNormalized(NormalizationForm.FormC));
Show("s2", s2);
Console.WriteLine();
s2 = s1.Normalize(NormalizationForm.FormD);
Console.Write("B3) Is s2 normalized to Form D?: ");
Console.WriteLine(s2.IsNormalized(NormalizationForm.FormD));
Show("s2", s2);
Console.WriteLine();
s2 = s1.Normalize(NormalizationForm.FormKC);
Console.Write("B4) Is s2 normalized to Form KC?: ");
Console.WriteLine(s2.IsNormalized(NormalizationForm.FormKC));
Show("s2", s2);
Console.WriteLine();
s2 = s1.Normalize(NormalizationForm.FormKD);
Console.Write("B5) Is s2 normalized to Form KD?: ");
Console.WriteLine(s2.IsNormalized(NormalizationForm.FormKD));
Show("s2", s2);
Console.WriteLine();
}
catch (Exception e)
{
Console.WriteLine(e.Message);
}
}
private static void Show(string title, string s)
{
Console.Write("Characters in string {0} = ", title);
foreach(short x in s.ToCharArray())
{
Console.Write("{0:X4} ", x);
}
Console.WriteLine();
}
}
/*
This example produces the following results:
Characters in string s1 = 0063 0301 0327 00BE
U+0063 = LATIN SMALL LETTER C
U+0301 = COMBINING ACUTE ACCENT
U+0327 = COMBINING CEDILLA
U+00BE = VULGAR FRACTION THREE QUARTERS
--------------------------------------------------------------------------------
A1) Is s1 normalized to the default form (Form C)?: False
A2) Is s1 normalized to Form C?: False
A3) Is s1 normalized to Form D?: False
A4) Is s1 normalized to Form KC?: False
A5) Is s1 normalized to Form KD?: False
--------------------------------------------------------------------------------
Set string s2 to each normalized form of string s1.
U+1E09 = LATIN SMALL LETTER C WITH CEDILLA AND ACUTE
U+0033 = DIGIT THREE
U+2044 = FRACTION SLASH
U+0034 = DIGIT FOUR
--------------------------------------------------------------------------------
B1) Is s2 normalized to the default form (Form C)?: True
Characters in string s2 = 1E09 00BE
B2) Is s2 normalized to Form C?: True
Characters in string s2 = 1E09 00BE
B3) Is s2 normalized to Form D?: True
Characters in string s2 = 0063 0327 0301 00BE
B4) Is s2 normalized to Form KC?: True
Characters in string s2 = 1E09 0033 2044 0034
B5) Is s2 normalized to Form KD?: True
Characters in string s2 = 0063 0327 0301 0033 2044 0034
*/
// This example demonstrates the String.Normalize method
// and the String.IsNormalized method
using namespace System;
using namespace System::Text;
void Show( String^ title, String^ s )
{
Console::Write( "Characters in string {0} = ", title );
System::Collections::IEnumerator^ myEnum = s->ToCharArray()->GetEnumerator();
while ( myEnum->MoveNext() )
{
/*) * __try_cast < Char * > ( myEnum -> Current );*/
int x;
Console::Write( "{0:X4} ", x );
}
Console::WriteLine();
}
int main()
{
// Character c; combining characters acute and cedilla; character 3/4
array<Char>^temp0 = {L'c',L'\u0301',L'\u0327',L'\u00BE'};
String^ s1 = gcnew String( temp0 );
String^ s2 = nullptr;
String^ divider = gcnew String( '-',80 );
divider = String::Concat( Environment::NewLine, divider, Environment::NewLine );
try
{
Show( "s1", s1 );
Console::WriteLine();
Console::WriteLine( "U+0063 = LATIN SMALL LETTER C" );
Console::WriteLine( "U+0301 = COMBINING ACUTE ACCENT" );
Console::WriteLine( "U+0327 = COMBINING CEDILLA" );
Console::WriteLine( "U+00BE = VULGAR FRACTION THREE QUARTERS" );
Console::WriteLine( divider );
Console::WriteLine( "A1) Is s1 normalized to the default form (Form C)?: {0}", s1->IsNormalized() );
Console::WriteLine( "A2) Is s1 normalized to Form C?: {0}", s1->IsNormalized( NormalizationForm::FormC ) );
Console::WriteLine( "A3) Is s1 normalized to Form D?: {0}", s1->IsNormalized( NormalizationForm::FormD ) );
Console::WriteLine( "A4) Is s1 normalized to Form KC?: {0}", s1->IsNormalized( NormalizationForm::FormKC ) );
Console::WriteLine( "A5) Is s1 normalized to Form KD?: {0}", s1->IsNormalized( NormalizationForm::FormKD ) );
Console::WriteLine( divider );
Console::WriteLine( "Set string s2 to each normalized form of string s1." );
Console::WriteLine();
Console::WriteLine( "U+1E09 = LATIN SMALL LETTER C WITH CEDILLA AND ACUTE" );
Console::WriteLine( "U+0033 = DIGIT THREE" );
Console::WriteLine( "U+2044 = FRACTION SLASH" );
Console::WriteLine( "U+0034 = DIGIT FOUR" );
Console::WriteLine( divider );
s2 = s1->Normalize();
Console::Write( "B1) Is s2 normalized to the default form (Form C)?: " );
Console::WriteLine( s2->IsNormalized() );
Show( "s2", s2 );
Console::WriteLine();
s2 = s1->Normalize( NormalizationForm::FormC );
Console::Write( "B2) Is s2 normalized to Form C?: " );
Console::WriteLine( s2->IsNormalized( NormalizationForm::FormC ) );
Show( "s2", s2 );
Console::WriteLine();
s2 = s1->Normalize( NormalizationForm::FormD );
Console::Write( "B3) Is s2 normalized to Form D?: " );
Console::WriteLine( s2->IsNormalized( NormalizationForm::FormD ) );
Show( "s2", s2 );
Console::WriteLine();
s2 = s1->Normalize( NormalizationForm::FormKC );
Console::Write( "B4) Is s2 normalized to Form KC?: " );
Console::WriteLine( s2->IsNormalized( NormalizationForm::FormKC ) );
Show( "s2", s2 );
Console::WriteLine();
s2 = s1->Normalize( NormalizationForm::FormKD );
Console::Write( "B5) Is s2 normalized to Form KD?: " );
Console::WriteLine( s2->IsNormalized( NormalizationForm::FormKD ) );
Show( "s2", s2 );
Console::WriteLine();
}
catch ( Exception^ e )
{
Console::WriteLine( e->Message );
}
}
/*
This example produces the following results:
Characters in string s1 = 0063 0301 0327 00BE
U+0063 = LATIN SMALL LETTER C
U+0301 = COMBINING ACUTE ACCENT
U+0327 = COMBINING CEDILLA
U+00BE = VULGAR FRACTION THREE QUARTERS
--------------------------------------------------------------------------------
A1) Is s1 normalized to the default form (Form C)?: False
A2) Is s1 normalized to Form C?: False
A3) Is s1 normalized to Form D?: False
A4) Is s1 normalized to Form KC?: False
A5) Is s1 normalized to Form KD?: False
--------------------------------------------------------------------------------
Set string s2 to each normalized form of string s1.
U+1E09 = LATIN SMALL LETTER C WITH CEDILLA AND ACUTE
U+0033 = DIGIT THREE
U+2044 = FRACTION SLASH
U+0034 = DIGIT FOUR
--------------------------------------------------------------------------------
B1) Is s2 normalized to the default form (Form C)?: True
Characters in string s2 = 1E09 00BE
B2) Is s2 normalized to Form C?: True
Characters in string s2 = 1E09 00BE
B3) Is s2 normalized to Form D?: True
Characters in string s2 = 0063 0327 0301 00BE
B4) Is s2 normalized to Form KC?: True
Characters in string s2 = 1E09 0033 2044 0034
B5) Is s2 normalized to Form KD?: True
Characters in string s2 = 0063 0327 0301 0033 2044 0034
*/
// This example demonstrates the String.Normalize method
// and the String.IsNormalized method
import System.*;
import System.Text.*;
class Sample
{
public static void main(String[] args)
{
// Character c; combining characters acute and cedilla; character 3/4
String s1 = new String(new char[] { '\u0063', '\u0301', '\u0327',
'\u00BE' });
String s2 = null;
String divider = new String('-', 80);
divider = String.Concat(Environment.get_NewLine(), divider,
Environment.get_NewLine());
try {
Show("s1", s1);
Console.WriteLine();
Console.WriteLine("U+0063 = LATIN SMALL LETTER C");
Console.WriteLine("U+0301 = COMBINING ACUTE ACCENT");
Console.WriteLine("U+0327 = COMBINING CEDILLA");
Console.WriteLine("U+00BE = VULGAR FRACTION THREE QUARTERS");
Console.WriteLine(divider);
Console.WriteLine("A1) Is s1 normalized to the default form "
+ "(Form C)?: {0}", System.Convert.ToString(s1.IsNormalized()));
Console.WriteLine("A2) Is s1 normalized to Form C?: {0}",
System.Convert.ToString(s1.
IsNormalized(NormalizationForm.FormC)));
Console.WriteLine("A3) Is s1 normalized to Form D?: {0}",
System.Convert.ToString(s1.
IsNormalized(NormalizationForm.FormD)));
Console.WriteLine("A4) Is s1 normalized to Form KC?: {0}",
System.Convert.ToString(s1.
IsNormalized(NormalizationForm.FormKC)));
Console.WriteLine("A5) Is s1 normalized to Form KD?: {0}",
System.Convert.ToString(s1.
IsNormalized(NormalizationForm.FormKD)));
Console.WriteLine(divider);
Console.WriteLine("Set string s2 to each normalized form of "
+ "string s1.");
Console.WriteLine();
Console.WriteLine("U+1E09 = LATIN SMALL LETTER C WITH CEDILLA "
+ "AND ACUTE");
Console.WriteLine("U+0033 = DIGIT THREE");
Console.WriteLine("U+2044 = FRACTION SLASH");
Console.WriteLine("U+0034 = DIGIT FOUR");
Console.WriteLine(divider);
s2 = s1.Normalize();
Console.Write("B1) Is s2 normalized to the default form "
+ "(Form C)?: ");
Console.WriteLine(s2.IsNormalized());
Show("s2", s2);
Console.WriteLine();
s2 = s1.Normalize(NormalizationForm.FormC);
Console.Write("B2) Is s2 normalized to Form C?: ");
Console.WriteLine(s2.IsNormalized(NormalizationForm.FormC));
Show("s2", s2);
Console.WriteLine();
s2 = s1.Normalize(NormalizationForm.FormD);
Console.Write("B3) Is s2 normalized to Form D?: ");
Console.WriteLine(s2.IsNormalized(NormalizationForm.FormD));
Show("s2", s2);
Console.WriteLine();
s2 = s1.Normalize(NormalizationForm.FormKC);
Console.Write("B4) Is s2 normalized to Form KC?: ");
Console.WriteLine(s2.IsNormalized(NormalizationForm.FormKC));
Show("s2", s2);
Console.WriteLine();
s2 = s1.Normalize(NormalizationForm.FormKD);
Console.Write("B5) Is s2 normalized to Form KD?: ");
Console.WriteLine(s2.IsNormalized(NormalizationForm.FormKD));
Show("s2", s2);
Console.WriteLine();
}
catch (System.Exception e) {
Console.WriteLine(e.get_Message());
}
} //main
private static void Show(String title, String s)
{
Console.Write("Characters in string {0} = ", title);
char myCharArray[] = s.ToCharArray();
for (int iCtr = 0; iCtr < myCharArray.length; iCtr++) {
char c = myCharArray[iCtr];
Console.Write(((System.Int32)c).ToString("X4") + " ");
}
Console.WriteLine();
} //Show
} //Sample
/*
This example produces the following results:
Characters in string s1 = 0063 0301 0327 00BE
U+0063 = LATIN SMALL LETTER C
U+0301 = COMBINING ACUTE ACCENT
U+0327 = COMBINING CEDILLA
U+00BE = VULGAR FRACTION THREE QUARTERS
--------------------------------------------------------------------------------
A1) Is s1 normalized to the default form (Form C)?: False
A2) Is s1 normalized to Form C?: False
A3) Is s1 normalized to Form D?: False
A4) Is s1 normalized to Form KC?: False
A5) Is s1 normalized to Form KD?: False
--------------------------------------------------------------------------------
Set string s2 to each normalized form of string s1.
U+1E09 = LATIN SMALL LETTER C WITH CEDILLA AND ACUTE
U+0033 = DIGIT THREE
U+2044 = FRACTION SLASH
U+0034 = DIGIT FOUR
--------------------------------------------------------------------------------
B1) Is s2 normalized to the default form (Form C)?: True
Characters in string s2 = 1E09 00BE
B2) Is s2 normalized to Form C?: True
Characters in string s2 = 1E09 00BE
B3) Is s2 normalized to Form D?: True
Characters in string s2 = 0063 0327 0301 00BE
B4) Is s2 normalized to Form KC?: True
Characters in string s2 = 1E09 0033 2044 0034
B5) Is s2 normalized to Form KD?: True
Characters in string s2 = 0063 0327 0301 0033 2044 0034
*/