/*
 * Mirror of https://github.com/vxunderground/VXUG-Papers.git
 * Synced: 2024-06-25 00:08:12 +00:00
 * Language: C (Win32) — NVIDIA CUDA GPU-memory proof-of-concept
 */
#include <Windows.h>

/* Suppress MSVC static-analysis warning C6011 (dereferencing a possibly
   NULL pointer); pointers are validated at runtime instead. */
#pragma warning(disable:6011)

/* Calling convention used by the nvcuda.dll driver API exports. */
#define CUDACALL __stdcall

/* Opaque handle to a CUDA context; the real type lives inside the driver.
   NOTE(review): the tag "PCUDE_CONTEXT" looks like a typo for
   "CUDA_CONTEXT" — harmless, since the struct is never defined. */
typedef struct PCUDE_CONTEXT* CUDA_CONTEXT;

/* cuMemAlloc_v2 — allocates device memory; the first argument is the
   address of the variable that receives the device pointer (cast to
   ULONG_PTR at the call site). */
typedef INT(CUDACALL* CUDAMEMORYALLOCATE)(ULONG_PTR, SIZE_T);

/* cuInit — initializes the CUDA driver API; the flags argument must be 0. */
typedef INT(CUDACALL* CUDAINIT)(INT);

/* cuDeviceGetCount — receives the number of CUDA-capable devices. */
typedef INT(CUDACALL* CUDAGETDEVICECOUNT)(PINT);

/* cuDeviceGet — receives a device handle for the given device ordinal. */
typedef INT(CUDACALL* CUDAGETDEVICE)(PINT, INT);

/* cuCtxCreate_v2 — creates a context on the given device. */
typedef INT(CUDACALL* CUDACREATECONTEXT)(CUDA_CONTEXT*, DWORD, INT);

/* cuCtxDestroy — destroys a context.
   NOTE(review): the real cuCtxDestroy takes the context by value, not by
   pointer — confirm against the CUDA driver API headers. */
typedef INT(CUDACALL* CUDADESTROYCONTEXT)(CUDA_CONTEXT*);

/* cuMemcpyHtoD_v2 — copies host memory to device memory. */
typedef INT(CUDACALL* CUDAMEMORYCOPYTODEVICE)(ULONG_PTR, PVOID, SIZE_T);

/* cuMemcpyDtoH_v2 — copies device memory back to host memory. */
typedef INT(CUDACALL* CUDAMEMORYCOPYTOHOST)(PVOID, ULONG_PTR, SIZE_T);

/* cuMemFree_v2 — frees device memory. */
typedef INT(CUDACALL* CUDAMEMORYFREE)(ULONG_PTR);

/* Every successful CUDA driver API call returns 0. */
#define CUDA_SUCCESS 0
|
|
|
|
/* Function-pointer table for the CUDA driver API entry points resolved at
   runtime from nvcuda.dll; populated by InitNvidiaCudaAPITable. */
typedef struct _NVIDIA_API_TABLE {
	HMODULE NvidiaLibary;                         // handle from LoadLibraryW(L"nvcuda.dll")
	CUDAMEMORYALLOCATE CudaMemoryAllocate;        // cuMemAlloc_v2
	CUDAINIT CudaInit;                            // cuInit
	CUDAGETDEVICECOUNT CudaGetDeviceCount;        // cuDeviceGetCount
	CUDAGETDEVICE CudaGetDevice;                  // cuDeviceGet
	CUDACREATECONTEXT CudaCreateContext;          // cuCtxCreate_v2
	CUDAMEMORYCOPYTODEVICE CudaMemoryCopyToDevice;// cuMemcpyHtoD_v2
	CUDAMEMORYCOPYTOHOST CudaMemoryCopyToHost;    // cuMemcpyDtoH_v2
	CUDAMEMORYFREE CudaMemoryFree;                // cuMemFree_v2
	CUDADESTROYCONTEXT CudaDestroyContext;        // cuCtxDestroy
} NVIDIA_API_TABLE, *PNVIDIA_API_TABLE;
|
|
|
|
/* Returns the number of WCHARs in String, excluding the terminating NUL.
   Minimal wcslen replacement so no CRT import is required. */
SIZE_T StringLengthW(LPCWSTR String)
{
	SIZE_T Length = 0;

	while (String[Length] != L'\0')
		Length++;

	return Length;
}
|
|
|
|
/* Returns a pointer to the first occurrence of Character in String, or
   NULL when it is absent. The terminating NUL is itself searchable,
   mirroring wcschr semantics. */
PWCHAR StringLocateCharW(PWCHAR String, INT Character)
{
	for (;; String++)
	{
		if (*String == Character)
			return String;

		if (*String == L'\0')
			return NULL;
	}
}
|
|
|
|
/* Compares at most Count WCHARs of String1 and String2 (a wcsncmp
   replacement). Returns 0 when the regions match, otherwise the
   difference between the first mismatching characters. Comparison stops
   early at a NUL in String1.
   Fix: the original truncated each WCHAR to UCHAR before comparing, so
   wide characters differing only in the high byte (e.g. 0x0041 L'A' vs
   0x0141) were wrongly treated as equal. The full 16-bit value is now
   compared. */
INT StringCompareStringRegionW(PWCHAR String1, PWCHAR String2, SIZE_T Count)
{
	WCHAR Block1, Block2;

	while (Count-- > 0)
	{
		Block1 = *String1++;
		Block2 = *String2++;

		if (Block1 != Block2)
			return (INT)Block1 - (INT)Block2;

		if (Block1 == L'\0')
			return 0;
	}

	return 0;
}
|
|
|
|
/* Returns a pointer to the first occurrence of String2 within String1
   (a wcsstr replacement), or NULL when String2 is not present.
   Fix: an empty String2 now returns String1, matching standard wcsstr
   behavior; the original returned a pointer to String1's terminator
   because StringLocateCharW matched the NUL. */
PWCHAR StringFindSubstringW(PWCHAR String1, PWCHAR String2)
{
	PWCHAR pPointer = String1;
	DWORD Length = (DWORD)StringLengthW(String2);

	if (Length == 0)
		return String1;

	/* Hop to each occurrence of String2's first character, then compare
	   the full needle at that position. */
	for (; (pPointer = StringLocateCharW(pPointer, *String2)) != NULL; pPointer++)
	{
		if (StringCompareStringRegionW(pPointer, String2, Length) == 0)
			return pPointer;
	}

	return NULL;
}
|
|
|
|
/* Copies the NUL-terminated String2 into String1, including the
   terminator (a wcscpy replacement). String1 must be large enough to
   hold the copy. Returns String1. */
PWCHAR StringCopyW(PWCHAR String1, PWCHAR String2)
{
	SIZE_T Index = 0;

	do {
		String1[Index] = String2[Index];
	} while (String2[Index++] != L'\0');

	return String1;
}
|
|
|
|
/* Appends String2 to the end of String (a wcscat replacement). The
   buffer behind String must have room for the combined result plus the
   terminator. Returns String. */
PWCHAR StringConcatW(PWCHAR String, PWCHAR String2)
{
	PWCHAR Terminator = &String[StringLengthW(String)];

	StringCopyW(Terminator, String2);

	return String;
}
|
|
|
|
BOOL IsNvidiaGraphicsCardPresent(VOID)
|
|
{
|
|
DISPLAY_DEVICEW DisplayDevice; RtlZeroMemory(&DisplayDevice, sizeof(DISPLAY_DEVICEW));
|
|
DisplayDevice.cb = sizeof(DISPLAY_DEVICEW);
|
|
|
|
DWORD dwDeviceId = ERROR_SUCCESS;
|
|
|
|
while (EnumDisplayDevicesW(NULL, dwDeviceId, &DisplayDevice, 0))
|
|
{
|
|
if (StringFindSubstringW(DisplayDevice.DeviceString, (PWCHAR)L"NVIDIA") != NULL)
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* Loads nvcuda.dll and resolves every CUDA driver API entry point this
   proof-of-concept uses into Api. Returns TRUE only when the library
   loads and all nine exports resolve; on failure the module handle (if
   any) is left in Api for the caller's cleanup path. */
BOOL InitNvidiaCudaAPITable(PNVIDIA_API_TABLE Api)
{
	Api->NvidiaLibary = LoadLibraryW(L"nvcuda.dll");
	if (Api->NvidiaLibary == NULL)
		return FALSE;

	Api->CudaInit = (CUDAINIT)GetProcAddress(Api->NvidiaLibary, "cuInit");
	Api->CudaGetDeviceCount = (CUDAGETDEVICECOUNT)GetProcAddress(Api->NvidiaLibary, "cuDeviceGetCount");
	Api->CudaGetDevice = (CUDAGETDEVICE)GetProcAddress(Api->NvidiaLibary, "cuDeviceGet");
	Api->CudaCreateContext = (CUDACREATECONTEXT)GetProcAddress(Api->NvidiaLibary, "cuCtxCreate_v2");
	Api->CudaDestroyContext = (CUDADESTROYCONTEXT)GetProcAddress(Api->NvidiaLibary, "cuCtxDestroy");
	Api->CudaMemoryAllocate = (CUDAMEMORYALLOCATE)GetProcAddress(Api->NvidiaLibary, "cuMemAlloc_v2");
	Api->CudaMemoryCopyToDevice = (CUDAMEMORYCOPYTODEVICE)GetProcAddress(Api->NvidiaLibary, "cuMemcpyHtoD_v2");
	Api->CudaMemoryCopyToHost = (CUDAMEMORYCOPYTOHOST)GetProcAddress(Api->NvidiaLibary, "cuMemcpyDtoH_v2");
	Api->CudaMemoryFree = (CUDAMEMORYFREE)GetProcAddress(Api->NvidiaLibary, "cuMemFree_v2");

	/* All exports must be present; a single NULL means an unusable table. */
	if (Api->CudaInit == NULL || Api->CudaGetDeviceCount == NULL ||
	    Api->CudaGetDevice == NULL || Api->CudaCreateContext == NULL ||
	    Api->CudaDestroyContext == NULL || Api->CudaMemoryAllocate == NULL ||
	    Api->CudaMemoryCopyToDevice == NULL || Api->CudaMemoryCopyToHost == NULL ||
	    Api->CudaMemoryFree == NULL)
	{
		return FALSE;
	}

	return TRUE;
}
|
|
|
|
/* Allocates ByteSize bytes of GPU memory via cuMemAlloc_v2 and returns
   the device pointer, or 0 on failure (zero bytes requested, or the
   CUDA call did not return CUDA_SUCCESS).
   Fix: the original used NULL — a pointer constant — to initialize and
   return the integer ULONG_PTR; 0 is the correct value for an integer
   device pointer. */
ULONG_PTR RtlAllocateGpuMemory(PNVIDIA_API_TABLE Api, DWORD ByteSize)
{
	ULONG_PTR GpuBufferPointer = 0;

	if (ByteSize == 0)
		return 0;

	/* cuMemAlloc_v2 writes the device pointer through its first argument;
	   the typedef declares that argument as ULONG_PTR, hence the cast of
	   the local's address. */
	if (Api->CudaMemoryAllocate((ULONG_PTR)&GpuBufferPointer, ByteSize) != CUDA_SUCCESS)
		return 0;

	return GpuBufferPointer;
}
|
|
|
|
INT main(VOID)
|
|
{
|
|
/********************************************************************
|
|
* Variables
|
|
********************************************************************/
|
|
|
|
//Application variables
|
|
DWORD dwError = ERROR_SUCCESS;
|
|
BOOL bFlag = FALSE;
|
|
|
|
//NVIDIA related variables
|
|
NVIDIA_API_TABLE Api = { 0 };
|
|
INT DeviceCount = 0;
|
|
INT Device = 0;
|
|
CUDA_CONTEXT Context = NULL;;
|
|
ULONG_PTR GpuMemory = NULL;
|
|
|
|
//Subroutine related variables, unimportant to proof-of-concept
|
|
WCHAR BinaryPath[MAX_PATH * sizeof(WCHAR)] = { 0 };
|
|
HANDLE hHandle = INVALID_HANDLE_VALUE;
|
|
PBYTE DataBuffer = NULL;
|
|
HANDLE ProcessHeap = GetProcessHeap();
|
|
DWORD dwRead = 0;
|
|
|
|
/********************************************************************
|
|
* Start
|
|
*********************************************************************
|
|
*
|
|
* IsNvidiaGraphicsCardPresent() invokes the EnumDisplayDevicesW and
|
|
* performs a string comparison on DISPLAY_DEVICEW member
|
|
* DeviceString to look for the presence of NVIDIA. If NVIDIA string
|
|
* is present IsNvidiaGraphicsCardPresent() returns true else false.
|
|
* If IsNvidiaGraphicsCardPresent() returns false the application
|
|
* terminates. This proof-of-concept is NVIDIA specific.
|
|
*
|
|
* If IsNvidiaGraphicsCardPresent() returns true, we make a
|
|
* subsequent function call to InitNvidiaCudaAPITable().
|
|
* InitNvidiaCudaAPITable() populates a NVIDIA_API_TABLE structure
|
|
* whose members are function pointers to NVIDIA-related APIs omit
|
|
* the NvidiaLibrary HMODULE member whose value is returned from a
|
|
* call to LoadLibrary.
|
|
*
|
|
* Anyway, enjoy the proof-of-concept.
|
|
* With love, smelly__vx
|
|
* vx-underground.org
|
|
*
|
|
********************************************************************/
|
|
|
|
if (!IsNvidiaGraphicsCardPresent())
|
|
goto EXIT_ROUTINE;
|
|
|
|
if (!InitNvidiaCudaAPITable(&Api))
|
|
goto EXIT_ROUTINE;
|
|
|
|
/********************************************************************
|
|
* Unimportant section
|
|
*********************************************************************
|
|
*
|
|
* This section performs trivial tasks unrelated to the core
|
|
* concept being illustrated in this proof of concept. The code
|
|
* below assembles a path to the desktop by using
|
|
* GetEnvironmentVariableW and appending a hardcoded path. The hard
|
|
* coded path is a binary present on the desktop (must be created
|
|
* by the user).
|
|
*
|
|
* Once creating the desktop path in memory, we invoke CreateFile
|
|
* and get a handle to file on the desktop. Subsequently this code
|
|
* gets the file size, allocates it in memory, and reads the content
|
|
* into member (type PBYTE).
|
|
*
|
|
********************************************************************/
|
|
|
|
if (GetEnvironmentVariableW(L"USERPROFILE", BinaryPath, (MAX_PATH * sizeof(WCHAR))) == 0)
|
|
goto EXIT_ROUTINE;
|
|
|
|
if (StringConcatW(BinaryPath, (PWCHAR)L"\\Desktop\\Demo.txt") == NULL)
|
|
goto EXIT_ROUTINE;
|
|
|
|
hHandle = CreateFileW(BinaryPath, GENERIC_READ | GENERIC_WRITE, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
|
|
if (hHandle == INVALID_HANDLE_VALUE)
|
|
goto EXIT_ROUTINE;
|
|
|
|
dwError = GetFileSize(hHandle, NULL);
|
|
if (dwError == INVALID_FILE_SIZE)
|
|
goto EXIT_ROUTINE;
|
|
|
|
DataBuffer = (PBYTE)HeapAlloc(ProcessHeap, HEAP_ZERO_MEMORY, dwError);
|
|
if (DataBuffer == NULL)
|
|
goto EXIT_ROUTINE;
|
|
|
|
if (!ReadFile(hHandle, DataBuffer, dwError, &dwRead, NULL))
|
|
goto EXIT_ROUTINE;
|
|
|
|
dwError = ERROR_SUCCESS;
|
|
|
|
/********************************************************************
|
|
* Unimportant code segment end
|
|
********************************************************************/
|
|
|
|
/********************************************************************
|
|
* NVIDIA CUDA Code segment
|
|
*********************************************************************
|
|
*
|
|
* The code below copies content from the data file read off the
|
|
* desktop onto the GPU, frees the heap, then copies the data back
|
|
*
|
|
* CUDA_SUCCESS is defined as 0 (zero). Successful CUDA invocations
|
|
* return 0 (zero).
|
|
*
|
|
* CudaInit(0) is required to initialize NVIDIA API interface. The
|
|
* subsequent function calls, CudaGetDeviceCount, and
|
|
* CudaGetDevice are generic NVIDIA functions to enumerating and
|
|
* identifying GPU devices - in the unlikely event more than one
|
|
* GPU is present.
|
|
*
|
|
* The handle returned by CudaGetDevice is used to create a CUDA
|
|
* context - the state of the application, how two consecutive
|
|
* API calls are related to each other while utilizing the CUDA
|
|
* runtime API.
|
|
*
|
|
* This code forwards to the NVIDIA driver.
|
|
*
|
|
* After a context is successfully created - we allocate memory
|
|
* on the GPU via our wrapper function, RtlAllocateGpuMemory, which
|
|
* returns a ULONG_PTR, a pointer to the allocated GPU memory.
|
|
* The calls that proceed this first copy data to the GPU, and
|
|
* finally, retrive the data back. This is illustrated by this
|
|
* proof-of-concept freeing the heap which contained data which was
|
|
* previously stored in the DataBuffer variable.
|
|
*
|
|
********************************************************************/
|
|
|
|
if (Api.CudaInit(0) != CUDA_SUCCESS)
|
|
goto EXIT_ROUTINE;
|
|
|
|
if (Api.CudaGetDeviceCount(&DeviceCount) != CUDA_SUCCESS || DeviceCount == 0)
|
|
goto EXIT_ROUTINE;
|
|
|
|
if (Api.CudaGetDevice(&Device, DeviceCount - 1) != CUDA_SUCCESS)
|
|
goto EXIT_ROUTINE;
|
|
|
|
if (Api.CudaCreateContext(&Context, 0, Device) != CUDA_SUCCESS)
|
|
goto EXIT_ROUTINE;
|
|
|
|
GpuMemory = RtlAllocateGpuMemory(&Api, dwRead);
|
|
if (GpuMemory == NULL)
|
|
goto EXIT_ROUTINE;
|
|
|
|
if (Api.CudaMemoryCopyToDevice(GpuMemory, DataBuffer, dwRead) != CUDA_SUCCESS)
|
|
goto EXIT_ROUTINE;
|
|
|
|
/********************************************************************
|
|
* Unimportant section
|
|
*********************************************************************
|
|
*
|
|
* Frees the heap, to illustrate the DataBuffer variable is indeed
|
|
* empty, containing no data. We then reallocate the buffer and
|
|
* copy the contents from the GPU back onto the hosts heap.
|
|
*
|
|
********************************************************************/
|
|
if (DataBuffer)
|
|
HeapFree(ProcessHeap, HEAP_ZERO_MEMORY, DataBuffer);
|
|
|
|
Sleep(1000);
|
|
|
|
DataBuffer = (PBYTE)HeapAlloc(ProcessHeap, HEAP_ZERO_MEMORY, dwRead);
|
|
if (DataBuffer == NULL)
|
|
goto EXIT_ROUTINE;
|
|
|
|
/********************************************************************
|
|
* Unimportant code segment end
|
|
********************************************************************/
|
|
|
|
if (Api.CudaMemoryCopyToHost(DataBuffer, GpuMemory, dwRead) != CUDA_SUCCESS)
|
|
goto EXIT_ROUTINE;
|
|
|
|
/********************************************************************
|
|
* EXIT_ROUTINE
|
|
*********************************************************************
|
|
*
|
|
* bFlag is a generic variable. Its value by default, FALSE,
|
|
* indicates whether or not the application has entered the
|
|
* exit routine normally. If the code has finished execution, then
|
|
* bFlag will be set to TRUE indicating there has been no issues.
|
|
* Otherwise, bFlag being FALSE indicates failure and GetLastError
|
|
* is invoked.
|
|
*
|
|
* Regardless of failure, each variable is checked to determine if
|
|
* it needs to be freed or unloaded.
|
|
*
|
|
********************************************************************/
|
|
|
|
bFlag = TRUE;
|
|
|
|
EXIT_ROUTINE:
|
|
|
|
if (!bFlag)
|
|
dwError = GetLastError();
|
|
|
|
if (DataBuffer)
|
|
HeapFree(ProcessHeap, HEAP_ZERO_MEMORY, DataBuffer);
|
|
|
|
if (GpuMemory != NULL)
|
|
Api.CudaMemoryFree(GpuMemory);
|
|
|
|
if (Context != NULL)
|
|
Api.CudaDestroyContext(&Context);
|
|
|
|
if (Api.NvidiaLibary)
|
|
FreeLibrary(Api.NvidiaLibary);
|
|
|
|
if (hHandle)
|
|
CloseHandle(hHandle);
|
|
|
|
return dwError;
|
|
} |